# Import Libraries and Load Packages

In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import accuracy_score,classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [None]:
####################### Install Prerequesties ###########################
!pip install -r requirements.txt
!python -m spacy download en_core_web_sm

<hr style="border:2px solid gray">

## **STEP: 1/4** - Extract Text Features and use them in Classification Pipelines

* Training

In [4]:

# Let's load news group dataset. This dataset has 20 types of news groups

# More detailed documentation about fetch newsgroup dataset is here - https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html
data_newsgroups  = fetch_20newsgroups(subset='train') # Training subset of labelled data


# Split Dataset
X_train,X_test,y_train,y_test = train_test_split(data_newsgroups.data,data_newsgroups.target,test_size=0.3)

# Build Classification pipeline

svc_tfidf = Pipeline([('tf_idf_vectorizer',TfidfVectorizer(stop_words='english',max_features=5000)),
                      ('svc',SVC(kernel='linear'))])

scores = cross_val_score(svc_tfidf,X_train,y_train,cv=2).mean()
print('Score',scores)

Score 0.8301560375158505


* Testing


In [5]:
# Fit the model to data and get test predictions
svc_tfidf.fit(X_train,y_train)
preds = svc_tfidf.predict(X_test)

# Evaluate the model and determine the metrics
accuracy_score_ = accuracy_score(y_test,preds)
classification_report_ = classification_report(y_test,preds)

print('Accuracy Score', accuracy_score_)
print('Classification Report',classification_report_)

Accuracy Score 0.872459499263623
Classification Report               precision    recall  f1-score   support

           0       0.97      0.91      0.94       146
           1       0.66      0.79      0.72       153
           2       0.82      0.79      0.80       207
           3       0.72      0.80      0.76       197
           4       0.86      0.79      0.83       162
           5       0.85      0.85      0.85       189
           6       0.85      0.84      0.85       195
           7       0.84      0.86      0.85       188
           8       0.95      0.90      0.92       181
           9       0.93      0.94      0.94       178
          10       0.95      0.95      0.95       170
          11       0.95      0.93      0.94       164
          12       0.72      0.80      0.76       166
          13       0.91      0.91      0.91       155
          14       0.94      0.89      0.92       168
          15       0.90      0.94      0.92       186
          16       0.91   

<hr style="border:2px solid gray">

## **STEP: 2/4** - Latent Semantic Analysis(LSA) for Document Classification


In [8]:
# Let's use same dataset as above for document classification. But only consider 2 out of the 20 categories of newsgroups

categories = ['talk.religion.misc','comp.graphics']
newsgroups_train = fetch_20newsgroups(subset='train',categories=categories,remove=('headers','footers','quotes'))
newsgroups_test = fetch_20newsgroups(subset='test',categories=categories,remove=('headers','footers','quotes'))

In [9]:
# Vectorize and split data

vectorizer = TfidfVectorizer(max_df=0.5,stop_words='english',use_idf=True,max_features=5000)
X_train,X_test,y_train,y_test = train_test_split(newsgroups_train.data,newsgroups_train.target,test_size=0.3)
X_train_tfidf = vectorizer.fit_transform(X_train)

In [10]:
# Let's project tfidf vectors to principle components and determine topics
svd = TruncatedSVD(100)
lsa = make_pipeline(svd,Normalizer(copy=False))

# Project the training data to lower dimensions using SVD
X_train_lsa = lsa.fit_transform(X_train_tfidf)

# Let's apply transformations to testing data
X_test_tfidf = vectorizer.transform(X_test)
X_test_lsa = lsa.transform(X_test_tfidf)

In [11]:

# Build classifier model - e.g. KnnClassifier
knn_classify = KNeighborsClassifier()
knn_classify.fit(X_train_lsa,y_train)

# Get test set predictions and evaluate model metrics
preds = knn_classify.predict(X_test_lsa)
score = accuracy_score(y_test,preds)
classify_report = classification_report(y_test,preds)
print(score)
print(classify_report)

0.9065743944636678
              precision    recall  f1-score   support

           0       0.89      0.96      0.92       172
           1       0.93      0.83      0.88       117

    accuracy                           0.91       289
   macro avg       0.91      0.89      0.90       289
weighted avg       0.91      0.91      0.91       289



<hr style="border:2px solid gray">

## **STEP: 3/4** - Prepare Text


In [12]:
# We will now go back to the twitter dataset.

DATA_PATH = "COVID-19-Twitter-India/hourly_tweets/"
file_names_hourly = os.listdir(DATA_PATH)

In [18]:

# Mapping Files From Hourly to Daily Basis
file_names_daily = [file_name[:-7] for file_name in file_names_hourly]
file_names_df = pd.DataFrame({'Hourly' : file_names_hourly, 'Daily': file_names_daily})
file_names_df.head()


Unnamed: 0,Hourly,Daily
0,coronavirus-tweet-id-2020-03-01-08.csv,coronavirus-tweet-id-2020-03-01
1,coronavirus-tweet-id-2020-03-01-05.csv,coronavirus-tweet-id-2020-03-01
2,coronavirus-tweet-id-2020-03-01-04.csv,coronavirus-tweet-id-2020-03-01
3,coronavirus-tweet-id-2020-03-01-06.csv,coronavirus-tweet-id-2020-03-01
4,coronavirus-tweet-id-2020-03-01-07.csv,coronavirus-tweet-id-2020-03-01


In [19]:
# Identity any corrupt files 
def corrupt_or_not(file_name):
    """Some csv files are corrupt this is a program to spot them in DATA_PATH,
    return : True if opens False for corrupt(not open)"""
    try:
        pd.read_csv(os.path.join(*[DATA_PATH,file_name]))
        return False
    except:
        return True

file_names_df['Corrupt'] = file_names_df['Hourly'].apply(corrupt_or_not)
file_names_df.groupby('Corrupt').count()


Unnamed: 0_level_0,Hourly,Daily
Corrupt,Unnamed: 1_level_1,Unnamed: 2_level_1
False,9,9


In [20]:
# Removing Corrupt Files 
file_names_df = file_names_df[file_names_df['Corrupt'] == False]


In [21]:
# Converting the Groupby object to dict such that key is the day and values are the hourly file names
file_daily_hourly_map = file_names_df.groupby('Daily')['Hourly'].apply(list).to_dict()


In [22]:
file_daily_hourly_map

{'coronavirus-tweet-id-2020-03-01': ['coronavirus-tweet-id-2020-03-01-08.csv',
  'coronavirus-tweet-id-2020-03-01-05.csv',
  'coronavirus-tweet-id-2020-03-01-04.csv',
  'coronavirus-tweet-id-2020-03-01-06.csv',
  'coronavirus-tweet-id-2020-03-01-07.csv',
  'coronavirus-tweet-id-2020-03-01-03.csv',
  'coronavirus-tweet-id-2020-03-01-02.csv',
  'coronavirus-tweet-id-2020-03-01-00.csv',
  'coronavirus-tweet-id-2020-03-01-01.csv']}

In [23]:

def single_frame(file_names):
    "Concatenates all dataframe from a day and returns dataframe after fixing the full_text column"
    hourly_df = [pd.read_csv(os.path.join(*[DATA_PATH,file_name])) for file_name in file_names]
    daily_df = pd.concat(hourly_df)
    daily_df = daily_df[(daily_df['full_text'] != 'No Value Mentioned') | (daily_df['full_retweet_text'] != 'No Value Mentioned')]
    daily_df.loc[daily_df['full_text'] == 'No Value Mentioned','full_text'] =  daily_df.loc[daily_df['full_text'] == 'No Value Mentioned','full_retweet_text']
    daily_df['full_text'] = daily_df['full_text'].astype(str)
    return daily_df


final_df_tweets = pd.DataFrame()
final_retweet_text_updated = []

for key,file_names in tqdm(file_daily_hourly_map.items()):
  final_df_tweets = pd.concat([final_df_tweets,single_frame(file_names)],ignore_index=True)

for f in list(final_df_tweets['full_retweet_text']):
  t = type(f)
  if f!='No Value Mentioned' and t==str:
    final_retweet_text_updated.append(f)

100%|██████████| 1/1 [00:00<00:00, 14.83it/s]


<hr style="border:2px solid gray">

## **STEP: 4/4** - Implement a topic model using gensim library and interpret document topic distributions 

In [24]:

from gensim import corpora
from gensim.models import ldamodel
from gensim.parsing.preprocessing import STOPWORDS
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation,strip_numeric

In [25]:
# List of all tweets
final_corpus = final_retweet_text_updated

# remove commond stopwords from each text in list of documents
texts = [[word for word in document.lower().split() if word not in STOPWORDS]
         for document in final_corpus]
all_tokens = sum(texts,[])

# remove duplicate tokens from set of words in each document of list
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
final_corpus = [[word for word in text if word not in tokens_once]
         for text in texts]

# make a bag of words corpus 
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# print out the documents and which is the most probable topics for each doc.
lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=20)
corpus_lda = lda[corpus]


lda_topics = lda.show_topics(num_words=5) # num_words signifies total number of words to represent each topic

topics = []
filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]

for topic in lda_topics:
    print(topic)
    topics.append(preprocess_string(topic[1], filters))

print(topics)

(4, '0.044*"coronavirus" + 0.031*"breaking:" + 0.030*"state" + 0.027*"case" + 0.024*"washington"')
(11, '0.038*"virus" + 0.029*"corona" + 0.021*"#adoptvedicholipreventcorona" + 0.019*"cow-dung" + 0.019*"treatment"')
(1, '0.028*"coronavirus" + 0.027*"save" + 0.025*"death" + 0.024*"reports" + 0.021*"#coronavirus"')
(8, '0.019*"#covid19" + 0.015*"#coronavirus" + 0.012*"coronavirus" + 0.012*"lot" + 0.011*"wonder"')
(16, '0.048*"north" + 0.048*"cure" + 0.035*"korea" + 0.028*"coronavirus." + 0.025*"citizens"')
(9, '0.041*"coronavirus" + 0.017*"#coronavirus" + 0.016*"u.s." + 0.014*"case" + 0.014*"in:"')
(7, '0.019*"diplomats" + 0.019*"india" + 0.019*"china" + 0.011*"coronavirus" + 0.011*"open"')
(0, '0.038*"corona" + 0.034*"like" + 0.030*"1" + 0.027*"virus" + 0.026*"environment"')
(10, '0.022*"coronavirus" + 0.019*"trump" + 0.016*"travel" + 0.015*"chinese" + 0.014*"corona"')
(18, '0.032*"india" + 0.031*"iran" + 0.030*"china" + 0.026*"kabir" + 0.026*"all."')
[['coronavirus', 'breaking', 'state

In [None]:
import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [None]:
# Visualization of topics
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary=lda.id2word)

In [27]:
vis