# Initial NLP analysis

Create the conda environment used by this notebook:

```
conda create --name NLP -c conda-forge python=3.10 jupyter pandas numpy matplotlib openpyxl textblob nltk
```

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# full data file with multiple sheets
# (relative path: the notebook must be run from the directory that contains `data/`)
filename = 'data/ITP_CourseArtifacts_June 2021_END_of_Course_DeIDENTIFIED.xlsx'

In [None]:
# name of the workbook sheet to load for this analysis;
# this sheet contains the responses to one question
sheet = 'Course Meta SelfEff'

In [None]:
# load only the selected sheet of the workbook into a DataFrame
df = pd.read_excel(filename, sheet_name=sheet)
# bare expression so the notebook renders the table as the cell output
df

## Look for n-grams

- NLTK (the approach followed below): https://towardsdatascience.com/from-dataframe-to-n-grams-e34e29df3460
- TextBlob (an alternative, not tried yet): https://levelup.gitconnected.com/simple-nlp-in-python-f5196db63aff


In [None]:
import unicodedata
import re

import nltk
from nltk.corpus import stopwords

In [None]:
# this only needs to be run once: uncomment the lines below to download the NLTK data
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [None]:
# Extra words to ignore in the analysis; basic_clean() appends them to NLTK's
# English stop-word list. Tokens are lower-cased and stripped of punctuation
# before the comparison, so entries here should be lower-case and unpunctuated.
ADDITIONAL_STOPWORDS = ['1', '2', 'one', 'two', 'etc']

In [None]:
def basic_clean(text):
    """
    Normalize a block of text and return its lemmatized, non-stop-word tokens.

    The text is NFKD-normalized, reduced to ASCII (characters that cannot be
    represented are dropped), lower-cased and stripped of every character
    that is neither a word character nor whitespace. It is then split on
    whitespace; tokens found in NLTK's English stop-word list or in
    ADDITIONAL_STOPWORDS are discarded and the remaining tokens are
    lemmatized with WordNet.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        Lemmatized tokens in their order of appearance.

    adapted from : https://towardsdatascience.com/from-dataframe-to-n-grams-e34e29df3460
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # A set makes each membership test constant-time instead of a scan of the
    # whole list per token; the distinct local name also avoids shadowing the
    # `stopwords` name imported from nltk.corpus at the top of the notebook.
    stop_words = set(nltk.corpus.stopwords.words('english')).union(ADDITIONAL_STOPWORDS)

    # Decompose accented characters, then drop whatever is still non-ASCII.
    ascii_text = (
        unicodedata.normalize('NFKD', text)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
        .lower()
    )

    # NOTE(review): punctuation is removed before the stop-word check, so a
    # contraction such as "don't" is compared as "dont" -- confirm whether
    # contractions are expected to be filtered out.
    tokens = re.sub(r'[^\w\s]', '', ascii_text).split()
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

In [None]:
# get all the words in order (excluding the stop words)
# The responses are taken from the second column of the sheet. Empty cells are
# dropped and the remaining responses are joined with a space. Cleaning
# str(list) instead would feed the list's repr to the cleaner: 'nan' for every
# empty cell, plus escape sequences such as '\n' whose letter ends up glued
# onto the following word once the backslash is stripped.
responses = df.iloc[:, 1].dropna().astype(str)
words = basic_clean(' '.join(responses))
words[:10]

In [None]:
# count how often each pair of consecutive words occurs
word_pairs = list(nltk.ngrams(words, 2))
bigrams = pd.Series(word_pairs).value_counts()
bigrams

In [None]:
# count how often each run of three consecutive words occurs
word_triples = list(nltk.ngrams(words, 3))
trigrams = pd.Series(word_triples).value_counts()
trigrams

In [None]:
# plot the results

N = 20  # maximum number of n-grams shown in each panel


def _plot_top_ngrams(ax, counts, limit, label):
    """Draw a horizontal bar chart of the first `limit` entries of `counts` on `ax`.

    `counts` is a Series of occurrence counts keyed by n-gram tuples and
    `label` is the n-gram name used in the panel title (e.g. 'Bigrams').
    """
    # take the leading entries positionally, then sort ascending so that the
    # largest count is drawn last, i.e. at the top of the horizontal bar chart
    top = counts.iloc[:limit].sort_values()
    # size the axis from the data: there may be fewer than `limit` n-grams, and
    # a fixed np.arange(limit) would then not match the number of bars
    positions = np.arange(len(top))
    ax.barh(positions, top.to_numpy(), 0.9, color='gray')
    ax.set_yticks(positions)
    # join each tuple key by iterating the index; this works whether pandas
    # holds the keys as an object Index of tuples or as a MultiIndex
    ax.set_yticklabels([' '.join(gram) for gram in top.index])
    ax.set_title(str(len(top)) + ' Most Frequently Occurring ' + label)
    ax.set_xlabel('# of Occurrences')


f, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
_plot_top_ngrams(ax1, bigrams, N, 'Bigrams')
_plot_top_ngrams(ax2, trigrams, N, 'Trigrams')

# the large wspace presumably leaves room for the right panel's long tick labels
f.subplots_adjust(wspace=0.9, left=0.15, right=0.99, top=0.95, bottom=0.07)

f.savefig('ngrams.png')

## Topic modeling

- NLTK and gensim : https://towardsdatascience.com/nlp-extracting-the-main-topics-from-your-dataset-using-lda-in-minutes-21486f5aa925
- NLTK and gensim : https://towardsdatascience.com/introduction-to-nlp-part-5b-unsupervised-topic-model-in-python-ab04c186f295
- pyLDAvis : https://www.projectpro.io/article/10-nlp-techniques-every-data-scientist-should-know/415#toc-10
- pyLDAvis : https://neptune.ai/blog/pyldavis-topic-modelling-exploration-tool-that-every-nlp-data-scientist-should-know