# NLP Analysis Pipeline

```
conda create --name NLP -c conda-forge python=3.10 jupyter pandas numpy matplotlib openpyxl nltk gensim pyldavis spacy
```

In [None]:
## If you are running this for the first time on a new installation, uncomment below and run this cell
## (This only needs to be run once.)

# import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# import spacy
# spacy.cli.download('en_core_web_sm')

In [None]:
# set to autoreload <-- only necessary while coding/debugging
%load_ext autoreload
%autoreload 2

from NLPforISP import *

In [None]:
# Excel workbook holding all of the data; individual sheets are read from it below
filename = "data/ITP_CourseArtifacts_June 2021_END_of_Course_DeIDENTIFIED.xlsx"

In [None]:
# Candidate topic counts handed to runNLPPipeline as num_topics: 2, 3, ..., 11
num_topics = np.arange(2, 12)

In [None]:
# Run the pipeline on column 1 of the 'Course Meta SelfEff' sheet.
results1 = runNLPPipeline(
    filename,
    sheet='Course Meta SelfEff',
    column_number=1,
    num_topics=num_topics,
    workers=6,
    random_seed=1234,
    n_answers=20,
    n_sentences=3,
    kmeans_tfidf_ngram_range=(1, 2),
    kmeans_tfidf_min_df=0.001,
    kmeans_num_words_for_label=10,
    # Optional overrides, kept for reference (uncomment to use):
    # no_below=15, no_above=1, keep_n=int(1e5),
    # run_lda=False, run_ngrams=False,
    coherence_method="combined",
    cvals=['c_v', 'u_mass'],
)

In [None]:
# Run the pipeline on column 1 of the 'Course Meta App' sheet.
results2 = runNLPPipeline(
    filename,
    sheet='Course Meta App',
    column_number=1,
    num_topics=num_topics,
    workers=6,
    random_seed=1234,
    n_answers=20,
    n_sentences=3,
    kmeans_tfidf_ngram_range=(1, 2),
    kmeans_tfidf_min_df=0.001,
    kmeans_num_words_for_label=10,
    # Optional overrides, kept for reference (uncomment to use):
    # no_below=15, no_above=1, keep_n=int(1e5),
    # run_lda=False, run_ngrams=False,
    coherence_method="combined",
    cvals=['c_v', 'u_mass'],
)

In [None]:
# Combined analysis: stack both sheets into one frame and run the pipeline on it.

# Load each sheet and give its second column (position 1) the common name 'answer_text'.
df1 = pd.read_excel(filename, sheet_name='Course Meta SelfEff')
df1 = df1.rename(columns={df1.columns[1]: 'answer_text'})

df2 = pd.read_excel(filename, sheet_name='Course Meta App')
df2 = df2.rename(columns={df2.columns[1]: 'answer_text'})

df = pd.concat([df1, df2])

# Strip surrounding whitespace so answers differing only in padding compare equal.
df['answer_text'] = df['answer_text'].str.strip()

# Remove duplicated answers (there are a few). keep=False flags *every* row whose
# answer_text occurs more than once, so no copy of a repeated answer is retained.
# NOTE(review): confirm that dropping all copies (rather than keeping the first) is intended.
is_repeated = df.duplicated(subset=["answer_text"], keep=False)
df = df[~is_repeated].reset_index(drop=True)

results_combined = runNLPPipeline(
    df=df,
    sheet="combined",
    column_number=1,
    num_topics=num_topics,
    workers=6,
    random_seed=1234,
    n_answers=20,
    n_sentences=3,
    kmeans_tfidf_ngram_range=(1, 2),
    kmeans_tfidf_min_df=0.001,
    kmeans_num_words_for_label=10,
    coherence_method="combined",
    cvals=['c_v', 'u_mass'],
)

# Try with TF-IDF in LDA 

Online sources generally recommend against using TF-IDF weights as input to LDA (the model is formulated over raw term counts), and it does give strange results here.

In [None]:
# LDA-only run on column 1 of the 'Course Meta SelfEff' sheet with TF-IDF enabled.
# use_tfidf=True matches the other TF-IDF cells in this section. The previous
# {'lsi': True, 'lda': False} setting switched TF-IDF off for LDA, which is the
# only model this cell runs (run_kmeans and run_lsi are both False), so the
# result was not actually a TF-IDF experiment.
results1_tfidf = runNLPPipeline(
    filename,
    sheet='Course Meta SelfEff',
    column_number=1,
    num_topics=num_topics,
    workers=6,
    random_seed=1234,
    n_answers=20,
    n_sentences=3,
    coherence_method="combined",
    use_tfidf=True,
    cvals=['c_v', 'u_mass'],
    run_kmeans=False,
    run_lda=True,
    run_lsi=False,
)

In [None]:
# LDA-only run on column 1 of the 'Course Meta App' sheet with TF-IDF enabled
# (k-means and LSI are switched off).
results2_tfidf = runNLPPipeline(
    filename,
    sheet='Course Meta App',
    column_number=1,
    num_topics=num_topics,
    workers=6,
    random_seed=1234,
    n_answers=20,
    n_sentences=3,
    coherence_method="combined",
    use_tfidf=True,
    cvals=['c_v', 'u_mass'],
    run_kmeans=False,
    run_lda=True,
    run_lsi=False,
)

In [None]:
# Combined analysis with TF-IDF: stack both sheets, de-duplicate, then run LDA only.

# Load each sheet and give its second column (position 1) the common name 'answer_text'.
df1 = pd.read_excel(filename, sheet_name='Course Meta SelfEff')
df1 = df1.rename(columns={df1.columns[1]: 'answer_text'})

df2 = pd.read_excel(filename, sheet_name='Course Meta App')
df2 = df2.rename(columns={df2.columns[1]: 'answer_text'})

df = pd.concat([df1, df2])

# Strip surrounding whitespace so answers differing only in padding compare equal.
df['answer_text'] = df['answer_text'].str.strip()

# Remove duplicated answers (there are a few). keep=False flags *every* row whose
# answer_text occurs more than once, so no copy of a repeated answer is retained.
# NOTE(review): confirm that dropping all copies (rather than keeping the first) is intended.
is_repeated = df.duplicated(subset=["answer_text"], keep=False)
df = df[~is_repeated].reset_index(drop=True)

results_combined_tfidf = runNLPPipeline(
    df=df,
    sheet="combined",
    column_number=1,
    num_topics=num_topics,
    workers=6,
    random_seed=1234,
    n_answers=20,
    n_sentences=3,
    coherence_method="combined",
    use_tfidf=True,
    cvals=['c_v', 'u_mass'],
    run_kmeans=False,
    run_lda=True,
    run_lsi=False,
)

## Visualize with pyLDAvis

In [None]:
# pyLDAvis plus its gensim adapter, used by the cells below to build the visualizations
import pyLDAvis
import pyLDAvis.gensim_models
# switch pyLDAvis to notebook mode so prepared visualizations display inline as cell output
pyLDAvis.enable_notebook()

In [None]:
# pyLDAvis view of the LDA model at results1's 'best_index'
# (results1 comes from the 'Course Meta SelfEff' run).
lda1 = results1['lda']
best_lda1 = lda1['model'][lda1['best_index']]
pyLDAvis.gensim_models.prepare(best_lda1, results1['bow_corpus'], results1['dictionary'])

In [None]:
# pyLDAvis view of the LDA model at results2's 'best_index'
# (results2 comes from the 'Course Meta App' run).
lda2 = results2['lda']
best_lda2 = lda2['model'][lda2['best_index']]
pyLDAvis.gensim_models.prepare(best_lda2, results2['bow_corpus'], results2['dictionary'])