# Import Libraries and Load Packages

In [1]:

import os
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import accuracy_score,classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [None]:
####################### Install Prerequesties ###########################
!pip install -r requirements.txt
!python -m spacy download en_core_web_sm

<hr style="border:2px solid gray">

## **STEP: 1/4** - Extract Text Features and use them in Classification Pipelines

* Training

In [None]:

# Let's load the newsgroups dataset. This dataset has 20 types of news groups.

# More detailed documentation about fetch newsgroup dataset is here - https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html
data_newsgroups  = fetch_20newsgroups(subset='train') # Training subset of labelled data


# Split Dataset: hold out 30% of the labelled posts for testing.
# The split is seeded so the scores reported below are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data_newsgroups.data,
    data_newsgroups.target,
    test_size=0.3,
    random_state=42,
)

# Build Classification pipeline

svc_tfidf = 

scores = cross_val_score(svc_tfidf,X_train,y_train,cv=2).mean()
print('Score',scores)

* Testing


In [None]:
# Fit the model to data and get test predictions

preds = 

# Score the test predictions: overall accuracy plus the per-class classification report
accuracy_score_, classification_report_ = (
    accuracy_score(y_test, preds),
    classification_report(y_test, preds),
)

print('Accuracy Score', accuracy_score_)
print('Classification Report', classification_report_)

<hr style="border:2px solid gray">

## **STEP: 2/4** - Latent Semantic Analysis(LSA) for Document Classification


In [8]:
# Reuse the newsgroups dataset for document classification, restricted to 2 of its 20 categories

categories = ['talk.religion.misc','comp.graphics']
# Options shared by both downloads: the same categories, with headers, footers and quotes removed
newsgroup_options = dict(categories=categories, remove=('headers','footers','quotes'))
newsgroups_train = fetch_20newsgroups(subset='train', **newsgroup_options)
newsgroups_test = fetch_20newsgroups(subset='test', **newsgroup_options)

In [9]:
# Vectorize and split data

vectorizer = TfidfVectorizer(max_df=0.5,stop_words='english',use_idf=True,max_features=5000)
# Seed the split so the downstream LSA / KNN results are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    newsgroups_train.data,
    newsgroups_train.target,
    test_size=0.3,
    random_state=42,
)
# Fit the vectorizer on the training split only, then transform that split.
X_train_tfidf = vectorizer.fit_transform(X_train)

In [10]:
# Let's project tfidf vectors to principal components and determine topics
svd = 
lsa = 

# Project the training data to lower dimensions using SVD
X_train_lsa = 

# Let's apply transformations to testing data
X_test_tfidf = 
X_test_lsa = 

In [None]:

# Fit a k-nearest-neighbours classifier (default settings) on the LSA-projected training data
knn_classify = KNeighborsClassifier().fit(X_train_lsa, y_train)

# Predict the test split, then print the accuracy followed by the classification report
preds = knn_classify.predict(X_test_lsa)
score, classify_report = accuracy_score(y_test, preds), classification_report(y_test, preds)
print(score, classify_report, sep='\n')

<hr style="border:2px solid gray">

## **STEP: 3/4** - Prepare Text


In [12]:
# We will now go back to the twitter dataset.

DATA_PATH = "COVID-19-Twitter-India/hourly_tweets/"
# os.listdir returns entries in arbitrary order; sort them so every run
# processes the hourly files in the same order.
file_names_hourly = sorted(os.listdir(DATA_PATH))

In [None]:

# Map every hourly file name to its daily key: the file name minus its last 7 characters
file_names_daily = [hourly_name[:-7] for hourly_name in file_names_hourly]
file_names_df = pd.DataFrame(dict(Hourly=file_names_hourly, Daily=file_names_daily))
file_names_df.head()


In [None]:
# Identify any corrupt files
def corrupt_or_not(file_name):
    """Report whether a CSV file under DATA_PATH fails to load.

    Parameters
    ----------
    file_name : str
        Name of the file, relative to DATA_PATH.

    Returns
    -------
    bool
        True if ``pd.read_csv`` raises while reading the file (the file is
        treated as corrupt), False if the file is read successfully.
    """
    try:
        pd.read_csv(os.path.join(DATA_PATH, file_name))
    except Exception:
        # Deliberately broad: any read or parse failure marks the file as corrupt.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt / SystemExit
        # through, so a long scan over many files can still be interrupted.
        return True
    return False

file_names_df['Corrupt'] = file_names_df['Hourly'].apply(corrupt_or_not)
file_names_df.groupby('Corrupt').count()


In [20]:
# Removing Corrupt Files
file_names_df = file_names_df[file_names_df['Corrupt'] == False]


In [21]:
# Converting the Groupby object to dict such that key is the day and values are the hourly file names
file_daily_hourly_map = 


In [None]:
file_daily_hourly_map

In [None]:

def single_frame(file_names):
    """Load one day's hourly CSV files into a single dataframe with a usable ``full_text``.

    Parameters
    ----------
    file_names : iterable of str
        Hourly file names, relative to DATA_PATH.

    Returns
    -------
    pandas.DataFrame
        The concatenated hourly frames, keeping only rows where ``full_text``
        or ``full_retweet_text`` differs from the 'No Value Mentioned'
        placeholder. Where ``full_text`` is the placeholder it is replaced by
        ``full_retweet_text``, and the column is finally cast to ``str``.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``file_names`` is empty.
    """
    hourly_frames = [pd.read_csv(os.path.join(DATA_PATH, file_name)) for file_name in file_names]
    daily_df = pd.concat(hourly_frames)

    placeholder = 'No Value Mentioned'
    has_text = daily_df['full_text'] != placeholder
    has_retweet_text = daily_df['full_retweet_text'] != placeholder
    # .copy() makes the filtered frame independent of the concatenated one, so the
    # assignments below do not raise SettingWithCopyWarning.
    daily_df = daily_df[has_text | has_retweet_text].copy()

    # Fall back to the retweet text wherever the tweet's own text is the placeholder.
    missing_text = daily_df['full_text'] == placeholder
    daily_df.loc[missing_text, 'full_text'] = daily_df.loc[missing_text, 'full_retweet_text']
    daily_df['full_text'] = daily_df['full_text'].astype(str)
    return daily_df


final_df_tweets = pd.DataFrame()
final_retweet_text_updated = []

for key,file_names in tqdm(file_daily_hourly_map.items()):
  final_df_tweets = 
  
# Keep only the retweet texts that are real strings and not the missing-value placeholder
final_retweet_text_updated.extend(
    retweet_text
    for retweet_text in list(final_df_tweets['full_retweet_text'])
    if retweet_text != 'No Value Mentioned' and type(retweet_text) == str
)

<hr style="border:2px solid gray">

## **STEP: 4/4** - Implement a topic model using gensim library and interpret document topic distributions 

In [24]:

from gensim import corpora
from gensim.models import ldamodel
from gensim.parsing.preprocessing import STOPWORDS
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation,strip_numeric

In [None]:
# List of all tweets
final_corpus = final_retweet_text_updated

# Lower-case and whitespace-tokenize each tweet, dropping common stopwords
texts = [[word for word in document.lower().split() if word not in STOPWORDS]
         for document in final_corpus]

# Flatten to one token list and count every token in a single pass.
# (sum(texts, []) followed by list.count() per unique token is quadratic in corpus size.)
all_tokens = [word for text in texts for word in text]
token_counts = Counter(all_tokens)

# Remove the tokens that occur exactly once in the whole corpus from every document
tokens_once = {word for word, count in token_counts.items() if count == 1}
final_corpus = [[word for word in text if word not in tokens_once]
         for text in texts]

# make a bag of words corpus 
dictionary = 
corpus = 

# Print each document together with its most probable topics.
lda = 
corpus_lda =


lda_topics = 

topics = []
filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]

for topic in lda_topics:
    print(topic)
    topics.append(preprocess_string(topic[1], filters))

print(topics)

In [None]:
import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [None]:
# Visualization of topics
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary=lda.id2word)

In [None]:
vis