In [None]:
# Read csv
import pandas as pd
# Load the CSV into a DataFrame. na_filter=False turns off pandas' missing-value
# detection, so empty cells stay as '' and can be compared against '' later on.
df = pd.read_csv(
    "CER_reasoning.csv",
    engine='python',
    encoding='cp1252',
    na_filter=False,
)

In [None]:
# Flag rows whose 'Text' value is empty once surrounding whitespace is trimmed,
# then keep every row that is not flagged.
is_blank = df['Text'].str.strip() == ''
df = df[~is_blank]

In [None]:
import string
def is_only_punctuation(text):
    """Return True when ``text`` holds nothing but punctuation and whitespace.

    Punctuation means the ASCII characters in ``string.punctuation``.
    Whitespace is ignored so that inputs such as ``"?? !!"`` are also
    recognised; otherwise they would turn into whitespace-only strings once
    punctuation is stripped from them.

    Args:
        text: The string to inspect.

    Returns:
        bool: True if every character is punctuation or whitespace (this is
        also True for the empty string), False as soon as any other
        character is present.
    """
    return all(char in string.punctuation or char.isspace() for char in text)
# Flag rows whose 'Text' value is_only_punctuation() reports as punctuation-only,
# then keep the remaining rows.
punctuation_only = df['Text'].apply(is_only_punctuation)
df = df[~punctuation_only]

In [None]:
#Remove special characters and punctuations
import regex
# Delete every character that is neither a word character (\w) nor whitespace (\s).
# The pattern is a raw string so the backslashes reach the regex engine untouched
# instead of being parsed as (invalid) Python string escapes.
# assign() returns a new DataFrame rather than writing into `df`, which at this
# point is a filtered slice; that avoids pandas' SettingWithCopyWarning.
df = df.assign(Text=df['Text'].str.replace(r'[^\w\s]', '', regex=True))

In [None]:
# Remove punctuation and stopwords from the 'Text' column, then lemmatize the text
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# Pull the 'Text' column out of the DataFrame as a plain Python list
data = df['Text'].values.tolist()
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with gensim.

    Every item is converted with ``str()`` and handed to
    ``gensim.utils.simple_preprocess``. Per gensim's documentation,
    ``deacc=True`` removes accent marks from the tokens.

    Args:
        sentences: Iterable of texts (or objects convertible with ``str``).

    Yields:
        The list of tokens produced for each input item, in input order.
    """
    for raw in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw), deacc=True)
        yield tokens
# Materialize the token generator: one list of tokens per document
data_words = [tokens for tokens in sent_to_words(data)]
# Preview the tokens of the first document only
print(data_words[:1])
import nltk 
# Fetch the NLTK corpora this cell reads: 'words' backs the `words` vocabulary
# set, and 'stopwords' backs stopwords.words('english'). Without the second
# download the stop-word lookup raises LookupError on a fresh environment.
nltk.download('words')
nltk.download('stopwords')
words = set(nltk.corpus.words.words())
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
#Add custom stop words in the list; for example stop_words.extend(['student', 'read', 'write'])
stop_words.extend([])
import spacy 
def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is converted with ``str()`` and re-tokenized with
    ``simple_preprocess``; tokens found in the module-level ``stop_words``
    collection are discarded.

    Args:
        texts: Iterable of documents (strings or lists of tokens).

    Returns:
        list[list[str]]: One list of surviving tokens per input document, in
        input order.
    """
    # Build the lookup set once per call: membership tests become O(1) instead
    # of a linear scan of the stop-word list for every token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

# Shared spaCy pipeline used by lemmatization() below
nlp = spacy.load('en_core_web_sm')
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces and parsed by the
    module-level spaCy pipeline ``nlp``. A token contributes its lemma when
    its coarse part-of-speech tag (``token.pos_``) is in ``allowed_postags``
    and ``len(token)`` is greater than 3.

    Tag reference: https://spacy.io/api/annotation

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Part-of-speech tags whose tokens are kept. The
            default list is only read, never modified.

    Returns:
        list[list[str]]: One list of lemmas per input document, in input
        order. A document with no qualifying tokens yields an empty list.
    """
    # Stream the joined documents through nlp.pipe so spaCy can batch them,
    # rather than paying the per-call overhead of nlp(...) for each document.
    joined_docs = (" ".join(sent) for sent in texts)
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags and len(token) > 3]
        for doc in nlp.pipe(joined_docs)
    ]
# Filter stop words out of the tokenized documents, then lemmatize what remains
data_words_nostops = remove_stopwords(data_words)
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
# Preview the first document only (same as the data_words preview) instead of
# dumping the entire lemmatized corpus into the cell output.
print(data_lemmatized[:1])

In [None]:
# Discard documents that were left with no lemmas at all
filtered_list = list(filter(None, data_lemmatized))
# Turn each remaining document into one space-separated string, with any commas removed
cleaned_data = [" ".join(lemmas).replace(",", "") for lemmas in filtered_list]

In [None]:
#Generate unigrams
from sklearn.feature_extraction.text import CountVectorizer
# Count single-word terms across all cleaned documents
vectorizer = CountVectorizer(ngram_range=(1, 1))
bag_of_words_unigrams = vectorizer.fit_transform(cleaned_data)
# Summing over axis 0 collapses the documents, leaving one total per vocabulary column
sum_words = bag_of_words_unigrams.sum(axis=0)
# vocabulary_ maps each term to its column index. int() converts the NumPy scalar
# to a plain Python int so the printed pairs read ('term', 12) on every NumPy version.
words_freq_uni = [(word, int(sum_words[0, idx])) for word, idx in vectorizer.vocabulary_.items()]
# Most frequent terms first
words_freq_uni = sorted(words_freq_uni, key=lambda x: x[1], reverse=True)
print(words_freq_uni[:100])

In [None]:
#Word cloud with unigrams
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Term -> count lookup built from the sorted unigram frequency pairs
words_dict = {term: count for term, count in words_freq_uni}
# Word cloud canvas width/height and the maximum number of terms to draw
WC_width, WC_height, WC_max_words = 1500, 1000, 200
wordCloud = WordCloud(width=WC_width, height=WC_height, max_words=WC_max_words)
wordCloud.generate_from_frequencies(words_dict)
# Render the cloud as an image, without axis ticks or frame
plt.imshow(wordCloud, interpolation='bilinear')
plt.title('Most frequently occurring unigrams')
plt.axis("off")
plt.show()

In [None]:
# Count two-word terms (bigrams) across all cleaned documents
vectorizer = CountVectorizer(ngram_range=(2, 2))
bag_of_words = vectorizer.fit_transform(cleaned_data)
# Summing over axis 0 collapses the documents, leaving one total per vocabulary column
sum_words = bag_of_words.sum(axis=0)
# vocabulary_ maps each bigram to its column index. int() converts the NumPy scalar
# to a plain Python int so the printed pairs read ('term term', 12) on every NumPy version.
words_freq_bi = [(word, int(sum_words[0, idx])) for word, idx in vectorizer.vocabulary_.items()]
# Most frequent bigrams first
words_freq_bi = sorted(words_freq_bi, key=lambda x: x[1], reverse=True)
# Show the top 100 only, like the unigram cell; the full list stays in words_freq_bi
print(words_freq_bi[:100])

In [None]:
#Word cloud with bigrams
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Term -> count lookup built from the sorted bigram frequency pairs
words_dict = {term: count for term, count in words_freq_bi}
# Word cloud canvas width/height and the maximum number of terms to draw
WC_width, WC_height, WC_max_words = 1500, 1000, 200
wordCloud = WordCloud(width=WC_width, height=WC_height, max_words=WC_max_words)
wordCloud.generate_from_frequencies(words_dict)
# Render the cloud as an image, without axis ticks or frame
plt.imshow(wordCloud, interpolation='bilinear')
plt.title('Most frequently occurring bigrams connected by same colour and font size')
plt.axis("off")
plt.show()