In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import string
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk import ngrams
from collections import Counter
from nltk.stem import WordNetLemmatizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
import pickle

# Data Cleaning and Extraction

In [None]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because",
                       "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not",
                       "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would",
                       "he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", 
                       "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have",
                       "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                       "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am",
                       "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
                       "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us",
                       "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not",
                       "mightn't've": "might not have", "must've": "must have", "mustn't": "must not",
                       "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                       "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                       "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have",
                       "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have",
                       "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                       "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is",
                       "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is",
                       "they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have",
                       "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would",
                       "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have",
                       "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is",
                       "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                       "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                       "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                       "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                       "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                       "you'd": "you would", "you'd've": "you would have",
                       "you'll": "you will", "you'll've": "you will have",
                       "you're": "you are", "you've": "you have" }

In [None]:
def split_textnum(text):
    '''
    To seperate numbers from the words.
    
    Input - Word
    
    Returns - Number seperated list of items.
    '''
    match = re.match(r"([a-z]+)([0-9]+)", text, re.I)
    if match:
        items = " ".join(list(match.groups()))
    else:
        match = re.match(r"([0-9]+)([a-z]+)", text, re.I)
        if match:
            items = " ".join(list(match.groups()))
        else:
            return text
    return (items)

In [None]:
def clean_text(text): 
            
    # Special characters
    text = re.sub(r"%20", " ", text)
    #text = text.replace(r".", " ")
    text = text.replace(r"@", " ")
    text = text.replace(r"#", " ")
    #text = text.replace(r":", " ")
    text = text.replace(r"'", " ")
    text = text.replace(r"\x89û_", " ")
    text = text.replace(r"??????", " ")
    text = text.replace(r"\x89ûò", " ")
    text = text.replace(r"16yr", "16 year")
    text = text.replace(r"re\x89û_", " ")
    
    text = text.replace(r"mh370", " ")
    text = text.replace(r"prebreak", "pre break")
    text = re.sub(r"\x89û", " ", text)
    text = re.sub(r"re\x89û", "re ", text)
    text = text.replace(r"nowplaying", "now playing")
    text = re.sub(r"\x89ûª", "'", text)
    text = re.sub(r"\x89û", " ", text)
    text = re.sub(r"\x89ûò", " ", text)
    
    
    text = re.sub(r"\x89Û_", "", text)
    text = re.sub(r"\x89ÛÒ", "", text)
    text = re.sub(r"\x89ÛÓ", "", text)
    text = re.sub(r"\x89ÛÏWhen", "When", text)
    text = re.sub(r"\x89ÛÏ", "", text)
    text = re.sub(r"China\x89Ûªs", "China's", text)
    text = re.sub(r"let\x89Ûªs", "let's", text)
    text = re.sub(r"\x89Û÷", "", text)
    text = re.sub(r"\x89Ûª", "", text)
    text = re.sub(r"\x89Û\x9d", "", text)
    text = re.sub(r"å_", "", text)
    text = re.sub(r"\x89Û¢", "", text)
    text = re.sub(r"\x89Û¢åÊ", "", text)
    text = re.sub(r"fromåÊwounds", "from wounds", text)
    text = re.sub(r"åÊ", "", text)
    text = re.sub(r"åÈ", "", text)
    text = re.sub(r"JapÌ_n", "Japan", text)    
    text = re.sub(r"Ì©", "e", text)
    text = re.sub(r"å¨", "", text)
    text = re.sub(r"SuruÌ¤", "Suruc", text)
    text = re.sub(r"åÇ", "", text)
    text = re.sub(r"å£3million", "3 million", text)
    text = re.sub(r"åÀ", "", text)
    
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r"ªs", " ", text)
    text = re.sub(r"ª", " ", text)
    text = re.sub(r"\x9d", " ", text)
    text = re.sub(r"ò", " ", text)
    text = re.sub(r"ªt", " ", text)
    text = re.sub(r"ó", " ", text)
    text = text.replace(r"11yearold", "11 year old")
    text = re.sub(r"typhoondevastated", "typhoon devastated", text)
    text = re.sub(r"bestnaijamade", "best nijamade", text)
    text = re.sub(r"gbbo", "The Great British Bake Off", text)
    text = re.sub(r"ï", "", text)
    text = re.sub(r"ïwhen", "when", text)
    text = re.sub(r"selfimage", "self image", text)
    text = re.sub(r"20150805", "2015 08 05", text)
    text = re.sub(r"20150806", "2015 08 06", text)
    text = re.sub(r"subreddits", "website for weird public sentiment", text)
    text = re.sub(r"disea", "chinese famous electronic company", text)
    text = re.sub(r"lmao", "funny", text)
    text = text.replace(r"companyse", "company")
    
    text = text.replace(r"worldnews", "world news")
    text = text.replace(r"animalrescue", "animal rescue")
    text = text.replace(r"freakiest", "freak")
    
    text = text.replace(r"irandeal", "iran deal")
    text = text.replace(r"directioners", "mentor")
    text = text.replace(r"justinbieber", "justin bieber")
    text = text.replace(r"okwx", "okay")
    text = text.replace(r"trapmusic", "trap music")
    text = text.replace(r"djicemoon", "music ice moon")
    text = text.replace(r"icemoon", "ice moon")
    text = text.replace(r"mtvhottest", "tv hottest")
    text = text.replace(r"rì©union", "reunion")
    text = text.replace(r"abcnews", "abc news")
    text = text.replace(r"tubestrike", "tube strike")
    text = text.replace(r"prophetmuhammad", "prophet muhammad muslim dharma")
    text = text.replace(r"chicagoarea", "chicago area")
    text = text.replace(r"yearold", "year old")
    text = text.replace(r"meatloving", "meat love")
    text = text.replace(r"standuser", "standard user")
    text = text.replace(r"pantherattack", "panther attack")
    text = text.replace(r"youngheroesid", "young hearos id")
    text = text.replace(r"idk", "i do not know")
    text = text.replace(r"usagov", "united state of america government")
    text = text.replace(r"injuryi", "injury")
    text = text.replace(r"summerfate", "summer fate")
    text = text.replace(r"kerricktrial", "kerrick trial")
    text = text.replace(r"viralspell", "viral spell")
    text = text.replace(r"collisionno", "collision")
    text = text.replace(r"socialnews", "social news")
    text = text.replace(r"nasahurricane", "nasa hurricane")
    text = text.replace(r"strategicpatience", "strategic patience")
    text = text.replace(r"explosionproof", "explosion proof")
    text = text.replace(r"selfies", "photo")
    text = text.replace(r"selfie", "photo")
    text = text.replace(r"worstsummerjob", "worst summer job")
    text = text.replace(r"realdonaldtrump", "real america president")
    text = text.replace(r"omfg", "oh my god")
    text = text.replace(r"japìn", "japan")
    text = text.replace(r"breakingnews", "breaking news")
    
    text = " ".join([split_textnum(word) for word in text.split(" ")])
    
    text = "".join([c if c not in string.punctuation else "" for c in text])
    text = ''.join(c for c in text if not c.isdigit())
    text = text.replace(r"÷", "")
    
    text = re.sub(' +', ' ', text)
    # text = text.encode('utf-8')
    return text

In [None]:
data = pd.read_csv("data/covid_related_tf_only.csv", encoding = "ISO-8859-1" ,index_col=False)

In [None]:
data

In [None]:

data['text_processed'] = data['tweet'].apply(lambda x : " ".join([contraction_mapping[word].lower() 
                    if word in contraction_mapping.keys() else word.lower() for word in x.split(" ")]))
# X_test['text_processed'] = X_test['tweet'].apply(lambda x : " ".join([contraction_mapping[word].lower() 
#                     if word in contraction_mapping.keys() else word.lower() for word in x.split(" ")]))
# X_train['text_processed'] = X_train['text_processed'].apply(lambda x : clean_text(x))
# X_test['text_processed'] = X_test['text_processed'].apply(lambda x : clean_text(x))

In [None]:
lemmatizer = WordNetLemmatizer()
data['text_processed'] = data['text_processed'].apply(lambda x : "".join([lemmatizer.lemmatize(word) 
                                                                            for word in x]))
# X_test['text_processed'] = X_test['text_processed'].apply(lambda x : "".join([lemmatizer.lemmatize(word) 
#                                                                           for word in x]))

In [None]:
# nltk.download('wordnet')

# Bag of Words

In [None]:
def cv(data):
    count_vectorizer = CountVectorizer()
    emb = count_vectorizer.fit_transform(data)
    return emb, count_vectorizer

# Tf-Idf

In [None]:
def tfidf(data):
    tfidf_vectorizer = TfidfVectorizer()
    train = tfidf_vectorizer.fit_transform(data)
    return train, tfidf_vectorizer

In [None]:
# y_train

# Training and Testing Data Splitting

In [None]:
len(data)

In [None]:
list_corpus = data["text_processed"].tolist()
list_labels = data["covid_related"].tolist()

X_train, X_test, y_train, y_test = train_test_split(list_corpus, list_labels, test_size=0.25)

In [None]:
len(X_test)

In [None]:

#Bag of Words embeddings
X_train_counts, count_vectorizer = cv(X_train)
X_test_counts = count_vectorizer.transform(X_test)

#TF-IDF embeddings
X_train_tfidf, tfidf_vectorizer = tfidf(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
count_vectorizer

In [None]:

pickle.dump(X_train_counts, open("data/train_count.pickle", "wb"))
pickle.dump(X_train_tfidf, open("data/train_tfidf.pickle", "wb"))

pickle.dump(X_test_counts, open("data/test_count.pickle", "wb"))
pickle.dump(X_test_tfidf, open("data/test_tfidf.pickle", "wb"))

pickle.dump(y_train, open("data/y_train.pickle", "wb"))
pickle.dump(y_test, open("data/y_test.pickle", "wb"))


pickle.dump(count_vectorizer, open("data/count_vectorizer.pickle", "wb"))
pickle.dump(tfidf_vectorizer, open("data/tfidf_vectorizer.pickle", "wb"))

# Data Training and Testing

In [3]:
X_train_counts = pickle.load(open("data/train_count.pickle", "rb"))
X_train_tfidf = pickle.load(open("data/train_tfidf.pickle", "rb"))

X_test_counts = pickle.load(open("data/test_count.pickle", "rb"))
X_test_tfidf = pickle.load(open("data/test_tfidf.pickle", "rb"))

y_train = pickle.load(open("data/y_train.pickle", "rb"))
y_test = pickle.load(open("data/y_test.pickle", "rb"))

In [4]:
X_test_counts.shape

(2523, 24063)

In [5]:
model = DecisionTreeClassifier(random_state=0)

In [6]:
model.fit(X_train_counts, y_train)

y_predict = model.predict(X_test_counts)

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)

0.9108204518430439

In [7]:
model.fit(X_train_tfidf, y_train)

y_predict = model.predict(X_test_tfidf)

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)

0.904478795085216

In [8]:
model = GaussianNB()


In [9]:
model.fit(X_train_counts.toarray(), y_train)

y_predict = model.predict(X_test_counts.toarray())

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)

0.816884661117717

In [10]:
model.fit(X_train_tfidf.toarray(), y_train)

y_predict = model.predict(X_test_tfidf.toarray())

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)

0.8164883075703527

In [11]:
model = SVC()

In [12]:
model.fit(X_train_counts, y_train)

y_predict = model.predict(X_test_counts.toarray())

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)

0.9270709472849782

In [13]:
model.fit(X_train_tfidf, y_train)

y_predict = model.predict(X_test_tfidf.toarray())

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)

0.9270709472849782

In [14]:
model = LinearSVC()

In [15]:
model.fit(X_train_counts, y_train)

y_predict = model.predict(X_test_counts)

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)

0.9207292905271502

In [16]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

       False       0.43      0.23      0.30       185
        True       0.94      0.98      0.96      2338

    accuracy                           0.92      2523
   macro avg       0.68      0.60      0.63      2523
weighted avg       0.90      0.92      0.91      2523



In [17]:
model.fit(X_train_tfidf, y_train)

y_predict = model.predict(X_test_tfidf.toarray())

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)

0.9318271898533492

In [18]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

       False       0.71      0.12      0.20       185
        True       0.93      1.00      0.96      2338

    accuracy                           0.93      2523
   macro avg       0.82      0.56      0.58      2523
weighted avg       0.92      0.93      0.91      2523



In [19]:
model = MLPClassifier()

In [20]:
model.fit(X_train_counts, y_train)

y_predict = model.predict(X_test_counts)

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)

0.9262782401902497

In [4]:
df_all = pd.read_csv("data/Data for joining/covid_related_full_ids.csv", engine="python", index_col=False)
df_9999 = pd.read_csv("data/Data for joining/tweets_to_label_Batch3_9999.csv", engine="python", index_col=False)
df_url = pd.read_csv("data/Data for joining/tweets_url_to_text.csv", engine="python", index_col=False)

In [13]:
df_merged=pd.merge(df_url.rename(columns={"tweet_id":"id"}), df_9999, on="id", how= "inner")

In [14]:
df_merged


Unnamed: 0,id,http_response_code,url_title,url_body,full_text,user_name
0,1222863488298565632,400,,,the government: coronavirus who ??? https://t....,“ vaguen. ”
1,1222438588555522048,200,Wuhan Coronavirus Infections | Scientist Warni...,,This scientist hopes to test coronavirus drugs...,NicoleReloaded🙋🏽‍♀
2,1221951156940754945,200,Coronavirus - Canada has advised to avoid all ...,Canada has advised Canadians not to travel to ...,Coronavirus - Canada has advised to avoid all ...,Skygains.com
3,1222428301920092165,200,Can the coronavirus be contained? Unknowns com...,Some early signs are discouraging: Six countri...,Can the coronavirus be contained? Unknowns com...,Walkirie
4,1221919177214242818,200,45 Million Chinese Now Under Quarantine As Off...,45 Million Chinese Now Under Quarantine As Off...,New story on NPR: 45 Million Chinese Now Under...,János Medenica
...,...,...,...,...,...,...
7149,1237505424829476864,400,,,This is what the status quo gets you. Healthca...,Eric Smith
7150,1242687185855246338,400,,,Perfect. Now release the funds 😈 https://t.co/...,🕵🏽‍♀️
7151,1244603588317458437,400,,,Humanity is in the gutter! How utterly depress...,Karen
7152,1239027328136531968,400,,,She is the reason why I am interested SO MUCH ...,ioo


In [15]:
df_all

Unnamed: 0,id,tweet,covid_related
0,1.221110e+18,60 is alot\r\n\r\n2nd Wuhan Coronavirus Case C...,TRUE
1,1.222670e+18,@CDCgov I think we're past the point of pointi...,FALSE
2,1.220430e+18,First Bats now snakes ! Get it right ! https:...,TRUE
3,1.221420e+18,"""Doctor on coronavirus: �We are more prepared�...",TRUE
4,1.223270e+18,Trump's Commerce Secretary Gets Dragged for Bo...,TRUE
...,...,...,...
10234,,Humanity is in the gutter! How utterly depress...,TRUE
10235,,One thing I can say about miss social distanci...,TRUE
10236,,"This Morning,Father was giving the communion u...",TRUE
10237,,She is the reason why I am interested SO MUCH ...,TRUE
