In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the comments corpus; later cells read the comment text from the column named '0'.
comments = pd.read_csv('Comments Corpus.csv')

In [None]:
# Preview the first rows of the loaded frame.
comments.head()

In [None]:
import matplotlib.pyplot as plt

# Number of characters in every comment of column '0'.
comment_lengths = comments['0'].str.len()

# Histogram of the lengths, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.hist(comment_lengths, bins=20, color='blue', alpha=0.7)
ax.set_xlabel('Comment Length')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Comment Lengths')
# One call styles the grid lines of both axes identically.
ax.grid(axis='both', linestyle='-', alpha=0.7)
plt.show()


In [None]:
def plot_word_length_histogram(text):
    """Plot a histogram of the mean word length of each entry in ``text``.

    Parameters
    ----------
    text : pandas.Series
        Series of strings; each entry is split on whitespace into words.

    Entries that cannot be split (missing values) and entries that contain
    no words at all (empty or whitespace-only strings) are left out of the
    histogram instead of raising or producing NaN means.
    """
    # `.str.split()` yields a missing value for missing entries; drop those
    # because iterating over them would raise a TypeError.
    tokens = text.str.split().dropna()
    # An empty word list has no mean; skip it rather than call np.mean([]).
    tokens = tokens[tokens.map(len) > 0]
    mean_word_lengths = tokens.map(
        lambda words: np.mean([len(word) for word in words])
    )

    plt.hist(mean_word_lengths, bins=20, color='green', alpha=0.7)
    plt.xlabel('Mean Word Length')
    plt.ylabel('Frequency')
    plt.title('Distribution of Mean Word Lengths')
    plt.grid(axis='y', linestyle='-', alpha=0.7)
    plt.grid(axis='x', linestyle='-', alpha=0.7)
    plt.show()

In [None]:
# Mean-word-length histogram for the comment text in column '0'.
plot_word_length_histogram(comments['0'])

In [None]:
import nltk
# Fetch the NLTK stop-word corpus so that `stopwords.words(...)` can be used in the next cell.
nltk.download('stopwords')
from nltk.corpus import stopwords


In [None]:
# English stop words held in a set; later cells test word membership against it.
stop = set(stopwords.words('english'))
# NOTE(review): pyplot is already imported as `plt` in an earlier cell; this re-import is redundant but harmless.
from matplotlib import pyplot as plt

In [None]:
from collections import defaultdict

# Split every comment on whitespace. Rows whose split result is missing
# (e.g. empty CSV cells) are dropped first, because iterating over them in
# the flattening step below would raise a TypeError.
new = comments['0'].str.split().dropna().values.tolist()

# Flat list of every whitespace-separated token in the corpus.
corpus = [word for i in new for word in i]

# Count stop-word occurrences. The comparison is done on the lower-cased
# token so that capitalised forms ("The", "And") are counted together with
# their lower-case entries in `stop`.
dic = defaultdict(int)
for word in corpus:
    token = word.lower()
    if token in stop:
        dic[token] += 1

In [None]:
def plot_hist(dic, top_n=10):
    """Draw a bar chart of the ``top_n`` highest-count entries of ``dic``.

    Parameters
    ----------
    dic : mapping
        Maps a word to its count.
    top_n : int, optional
        Number of bars to draw (default 10, the previously hard-coded value).

    Raises
    ------
    ValueError
        If ``dic`` is empty or ``top_n`` is smaller than 1; there would be
        nothing to unpack into bar positions and heights.
    """
    if not dic:
        raise ValueError("plot_hist received an empty mapping; nothing to plot")
    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    # Highest counts first; ties keep the mapping's own order.
    top = sorted(dic.items(), key=lambda item: item[1], reverse=True)[:top_n]
    x, y = zip(*top)
    plt.bar(x, y)
    plt.ylabel('Frequency')
    plt.title(f'Top {top_n} most frequent words')

In [None]:
# Bar chart of the most frequent stop words counted in `dic`.
plot_hist(dic)

In [None]:
# `Counter` is part of the standard library (`collections.Counter`), so there
# is nothing to install for it. Any PyPI distribution named "Counter" is
# unrelated to the class imported in the next cell.

In [None]:
from collections import Counter
import seaborn as sns

# Frequency of every token in the flat corpus, most common first.
counter = Counter(corpus)
most = counter.most_common()

# Among the 40 most common tokens, keep those that are not stop words.
# The stop-word test is done on the lower-cased token so that capitalised
# stop words ("The", "And") do not leak into the chart.
x, y = [], []
for word, count in most[:40]:
    if word.lower() not in stop:
        y.append(word)
        x.append(count)

sns.set(style="whitegrid")
plt.figure(figsize=(7, 4))
sns.barplot(x=x, y=y)
plt.xlabel('Frequency', fontsize=14)
plt.ylabel('Words', fontsize=14)
plt.title('Most Frequent Words after Removing Stop Words', fontsize=16)


In [None]:
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer
def get_top_ngram(corpus, n=None, top_k=10):
    """Return the ``top_k`` most frequent word n-grams in ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        The documents to count n-grams in.
    n : int, optional
        Size of the n-grams. ``None`` (the default) is treated as 1, because
        an ``ngram_range`` of ``(None, None)`` is not usable.
    top_k : int, optional
        Number of n-grams to return (default 10, the previously hard-coded
        value).

    Returns
    -------
    list of (str, count)
        N-grams with their total count over the corpus, highest count first.
    """
    if n is None:
        n = 1

    vec = CountVectorizer(ngram_range=(n, n))
    # Learn the vocabulary and build the document-term matrix in one pass.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the corpus-wide count of every n-gram.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:top_k]


In [None]:
# Ten most frequent bi-grams; get_top_ngram already limits the result to ten.
top_n_bigrams = get_top_ngram(comments['0'], 2)
# Unzip the (bi-gram, count) pairs: y holds the labels, x the counts,
# the same convention the tri-gram cell uses.
y, x = map(list, zip(*top_n_bigrams))
sns.set(style="whitegrid")
plt.figure(figsize=(7, 4))
sns.barplot(x=x, y=y)
plt.xlabel('Frequency', fontsize=14)
plt.ylabel('Bi-grams', fontsize=14)
plt.title('Top bi-grams', fontsize=16)

In [None]:
# Most frequent tri-grams in the comment text.
top_tri_grams = get_top_ngram(comments['0'], n=3)
# Unzip the (tri-gram, count) pairs: y holds the labels, x the counts.
y, x = map(list, zip(*top_tri_grams))
sns.set(style="whitegrid")
plt.figure(figsize=(7, 4))
sns.barplot(x=x, y=y)
plt.xlabel('Frequency', fontsize=14)
plt.ylabel('Tri-grams', fontsize=14)
plt.title('Top tri-grams', fontsize=16)

In [None]:
# Download the NLTK resources named 'punkt' and 'wordnet'; the preprocessing
# cell below uses word_tokenize and WordNetLemmatizer.
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
from nltk.stem.porter import *
from nltk.stem.wordnet import *
from nltk.tokenize import word_tokenize

def preprocess(df):
    """Tokenise, stop-word-filter and lemmatise every comment in ``df['0']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column ``'0'`` holds the comment text.

    Returns
    -------
    list of list of str
        One token list per row of ``df``, in row order. Rows that do not
        hold a string (e.g. missing values) yield an empty list so the
        result stays aligned with ``df``.
    """
    lem = WordNetLemmatizer()
    corpus = []

    for comment in df['0']:
        # word_tokenize needs a string; keep a placeholder for other rows.
        if not isinstance(comment, str):
            corpus.append([])
            continue

        # Compare the lower-cased token so capitalised stop words are removed too.
        words = [w for w in word_tokenize(comment) if w.lower() not in stop]
        # Tokens of one or two characters are discarded before lemmatising.
        words = [lem.lemmatize(w) for w in words if len(w) > 2]
        corpus.append(words)

    return corpus

In [None]:
# Rebinds `corpus`: from here on it is a list of token lists (one per comment),
# no longer the flat word list built in an earlier cell.
corpus = preprocess(comments)

In [None]:
import gensim

# Build the gensim dictionary from the tokenised comments, then run every
# comment's token list through its doc2bow method.
dic = gensim.corpora.Dictionary(corpus)
bow_corpus = list(map(dic.doc2bow, corpus))

In [None]:
# Fit a 4-topic LDA model on the bag-of-words corpus. `random_state` is fixed
# so that a re-run of the notebook starts from the same initialisation.
# NOTE(review): with several workers the topics may still differ slightly
# between runs — confirm if exact reproducibility is required.
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                   num_topics = 4,
                                   id2word = dic,
                                   passes = 10,
                                   workers = 2,
                                   random_state = 42)
lda_model.show_topics()

In [None]:
import pyLDAvis
try:
    # Newer pyLDAvis releases ship the gensim adapter as `gensim_models`.
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    # Older releases expose the same adapter as `pyLDAvis.gensim`.
    import pyLDAvis.gensim as gensimvis

# Render the interactive topic visualisation inline in the notebook.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, bow_corpus, dic)
vis

In [None]:
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Show only the first few token lists; displaying the whole corpus floods the
# cell output.
corpus[:5]

In [None]:
!python3 -m spacy download en_core_web_sm

In [None]:
import spacy
# Load the 'en_core_web_sm' pipeline; the cells below call `nlp(...)` and read `.ents`.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Run the pipeline on a sample sentence; its entities are inspected in the next cells.
doc = nlp("Diu was liberated from Portuguese rule by the Indian government in 1961")

In [None]:
# (text, label) pair for every entity found in the sample sentence.
[(entity.text, entity.label_) for entity in doc.ents]

In [None]:
from spacy import displacy
# Render the sample sentence with displaCy's entity style.
displacy.render(doc , style = "ent")

In [None]:
def ner(text):
    """Return the label of every entity that ``nlp`` finds in ``text``.

    The labels are returned in the order the entities appear in ``.ents``.
    """
    labels = []
    for entity in nlp(text).ents:
        labels.append(entity.label_)
    return labels

In [None]:
# Entity labels of every comment, one list per row.
ent = comments['0'].apply(ner)


In [None]:
# Count how often each entity label occurs across all comments and keep the
# ten most frequent as (label, count) pairs, highest count first. Ties keep
# first-seen order, as the stable sort over a dict did before.
ner_map = Counter(ner_type for sub in ent for ner_type in sub).most_common(10)

In [None]:
# Display the (label, count) pairs computed in the previous cell.
ner_map

In [None]:
# Unzip the (label, count) pairs: y holds the entity labels, x the counts.
y, x = map(list, zip(*ner_map))
sns.barplot(x=x, y=y)
# Label the figure so it can be read on its own.
plt.xlabel('Frequency')
plt.ylabel('Entity type')
plt.title('Most frequent named-entity types')

In [None]:
def get_top_ner(text , ent):
    """Return the text of every entity in ``text`` whose label equals ``ent``.

    ``text`` is run through ``nlp``; matches are returned in the order the
    entities appear in ``.ents``.
    """
    matches = []
    for entity in nlp(text).ents:
        if entity.label_ == ent:
            matches.append(entity.text)
    return matches

In [None]:
# Texts of the entities labelled "GPE" in every comment, one list per row.
gpe = comments['0'].apply(get_top_ner, args=("GPE",))

In [None]:
# Count how often each GPE entity text occurs across all comments and keep
# the ten most frequent as (name, count) pairs, highest count first. Ties
# keep first-seen order, as the stable sort over a dict did before.
gpe_map = Counter(name for sub in gpe for name in sub).most_common(10)

In [None]:
# Unzip the (name, count) pairs: y holds the entity texts, x the counts.
y , x = map(list , zip(*gpe_map))

In [None]:
sns.barplot(x=x, y=y)
# Label the figure so it can be read on its own.
plt.xlabel('Frequency')
plt.ylabel('GPE entity')
plt.title('Most frequent GPE entities')