In [1]:
import pandas as pd
import numpy as np

#NLTK library used for NLP techniques
import nltk

#string library used for removing punctuation and stopwords
import string

#re is used for Regex operations
import re

#showing how many characters in a line
pd.set_option('display.max_colwidth', 300)   

#reading the final dataset
data = pd.read_csv("D:\\SJSU\\Sem 2\\CS271_ML\\project\\Dataset - Final Sheet (2).tsv", sep='\t', header=None)

#columns of the data frame
data.columns = ['username', 'tweet', 'label', 'random']
#data.head()

stopwords = nltk.corpus.stopwords.words('english')

'''
remove_extras implemented to remove the following:
1.) The hyperlinks and URLs in the tweet
2.) RT or retweet tag
3.) HTML character codes like &amp

Parameters: a list of tokens
Returns: a string with the removed elements
'''
def remove_extras(tokens):
    text_noUrl = []
    for word in tokens:
        if "https" in word or "http" in word or word == "RT" or word == "amp":
            continue
        text_noUrl.append(word)
    return text_noUrl

'''
remove_punctAndAlphanumeric implemented to remove the punctuations and other alphanumeric characters in the text

Parameters: a string
Returns: a string with the removed elements
''' 
def remove_punctAndAlphanumeric(text):
    text_noExtra = "".join([char for char in text if char not in string.punctuation and (char in string.ascii_letters or char in char in string.whitespace)])
    return text_noExtra


'''
remove_stop implemented to remove the stopwords except "IS" because "IS" is short form for ISIS

Parameters: a list of tokens
Returns: a list of tokens with stopword tokens removed
'''
def remove_stop(tokens):
    tokens_noStop = [word for word in tokens if word not in stopwords]
    return tokens_noStop

'''
lemmatize implemented to lemmatize words/reduce each word to its basic lemma or root

Parameters: a list of tokens
Returns: a list of tokens with lemmatized words
'''
wn = nltk.WordNetLemmatizer()
def lemmatize(tokens):
    lemma_tokens = [wn.lemmatize(word) for word in tokens]
    return lemma_tokens

'''
cleanAndTokenize implemented for the cleaning, tokenizing and lemmatizing the tweets

Parameters: a string (tweets)
Returns: a list of tokens (cleaned and lemmatized)
'''
def cleanAndTokenize(text):
    cleanedText = remove_punctAndAlphanumeric(text)
    tokens = re.split('\W+',cleanedText)
    tokens_removedExtras = remove_extras(tokens)
    tokens_removedStop = remove_stop(tokens_removedExtras)
    tokens_lemmatized = lemmatize(tokens_removedStop)
    return tokens_lemmatized

In [2]:
data['preprocessed_tweet'] = data['tweet'].apply(lambda x: cleanAndTokenize(x.lower()))

In [3]:
data['tweet_len'] = data['tweet'].apply(lambda x:len(x)-x.count(" "))
data.head(20)

Unnamed: 0,username,tweet,label,random,preprocessed_tweet,tweet_len
0,jxnatc,@RondaRousey Good luck tonight Rowdy! We're pulling for you! #NoToISIS #Prayers4Paris,0,0.166829,"[rondarousey, good, luck, tonight, rowdy, pulling, notoisis, prayersparis]",75
1,ashuneras,this is such a good question,0,0.791709,"[good, question]",23
2,CSULBASI,Have you ever wondered what major fits your extracurricular interests? Well now you can find out with one simple quiz! ‚†Ä ‚†Ä Take the quiz https://bit.ly/2XXpPGD,0,0.087267,"[ever, wondered, major, fit, extracurricular, interest, well, find, one, simple, quiz, take, quiz]",138
3,dqkidd,"#KSATFIESTA hello Leilana, Eli, Maxi, Aliyah, Carmen, Gabby, Ellie and paul watching the parade!!!!!! Quirogas &amp; Amayas Viva Fiesta!!!!",0,0.861475,"[ksatfiesta, hello, leilana, eli, maxi, aliyah, carmen, gabby, ellie, paul, watching, parade, quirogas, amayas, viva, fiesta]",121
4,Khen_Bee,"If you want to leave, take good care. Hope you have a lot of nice things to wear, but then a lot of nice things turn bad out there.",0,0.514142,"[want, leave, take, good, care, hope, lot, nice, thing, wear, lot, nice, thing, turn, bad]",103
5,douglvslee,Regina Hall was my first girl crush. her character in Malibu‚Äôs Most Wanted did something to my lil hormones.,0,0.237339,"[regina, hall, first, girl, crush, character, malibus, wanted, something, lil, hormone]",92
6,lilwolf___,how do girls take cute tongue out pics??? I look like I‚Äôm ab to slob on a knob when I do that shit,0,0.991494,"[girl, take, cute, tongue, pic, look, like, im, ab, slob, knob, shit]",78
7,thebabydoll__,So did he really pass or what ?,0,0.362523,"[really, pas, ]",24
8,abubakerdimshqi,from the Heart love and respect for all #MUJAHDIN in BILAD #ALSHAM Brothers&amp;Sisters you are our hope to lead us to right way #IS,1,0.041372,"[heart, love, respect, mujahdin, bilad, alsham, brothersampsisters, hope, lead, u, right, way]",109
9,robertmanoin,you misspelled RED omg,0,0.437904,"[misspelled, red, omg]",19


In [4]:
radical_keywords = ['KUFFUR', 'IS', 'ISLAMIC STATE', '#ILOVEISIS', 'KUFFARS', 'mujahiddeen',
                    'kuffur', 'KUFFAR', 'kuffars', 'kuffar', 'kafir', 'MUJAHIDEEN', 'KUFAR', 'KAFIR',
                    'KUFR', 'mujahideen', '#IS', 'kufar', 'kufr', 'mujahid', 'JIHAD', 'jihad', 'MUJAHID',
                    'MUJAHIDDEEN', '#ISIS', 'Islamic State', '#ILoveISIS', 'ISIL', 'allah', 'Allah', 'Assad', 'assad', 'PKK',
                    'YPG', '#AleppoIsBurning', 'Aleppo', 'martydom', 'Martyrdom']

def radicalWordPresence(text):
    for key in radical_keywords:
        if key in text:
            return 1
    return 0
data['rad_key'] = data['tweet'].apply(lambda x:radicalWordPresence(x))
data.head(20)

Unnamed: 0,username,tweet,label,random,preprocessed_tweet,tweet_len,rad_key
0,jxnatc,@RondaRousey Good luck tonight Rowdy! We're pulling for you! #NoToISIS #Prayers4Paris,0,0.166829,"[rondarousey, good, luck, tonight, rowdy, pulling, notoisis, prayersparis]",75,1
1,ashuneras,this is such a good question,0,0.791709,"[good, question]",23,0
2,CSULBASI,Have you ever wondered what major fits your extracurricular interests? Well now you can find out with one simple quiz! ‚†Ä ‚†Ä Take the quiz https://bit.ly/2XXpPGD,0,0.087267,"[ever, wondered, major, fit, extracurricular, interest, well, find, one, simple, quiz, take, quiz]",138,0
3,dqkidd,"#KSATFIESTA hello Leilana, Eli, Maxi, Aliyah, Carmen, Gabby, Ellie and paul watching the parade!!!!!! Quirogas &amp; Amayas Viva Fiesta!!!!",0,0.861475,"[ksatfiesta, hello, leilana, eli, maxi, aliyah, carmen, gabby, ellie, paul, watching, parade, quirogas, amayas, viva, fiesta]",121,0
4,Khen_Bee,"If you want to leave, take good care. Hope you have a lot of nice things to wear, but then a lot of nice things turn bad out there.",0,0.514142,"[want, leave, take, good, care, hope, lot, nice, thing, wear, lot, nice, thing, turn, bad]",103,0
5,douglvslee,Regina Hall was my first girl crush. her character in Malibu‚Äôs Most Wanted did something to my lil hormones.,0,0.237339,"[regina, hall, first, girl, crush, character, malibus, wanted, something, lil, hormone]",92,0
6,lilwolf___,how do girls take cute tongue out pics??? I look like I‚Äôm ab to slob on a knob when I do that shit,0,0.991494,"[girl, take, cute, tongue, pic, look, like, im, ab, slob, knob, shit]",78,0
7,thebabydoll__,So did he really pass or what ?,0,0.362523,"[really, pas, ]",24,0
8,abubakerdimshqi,from the Heart love and respect for all #MUJAHDIN in BILAD #ALSHAM Brothers&amp;Sisters you are our hope to lead us to right way #IS,1,0.041372,"[heart, love, respect, mujahdin, bilad, alsham, brothersampsisters, hope, lead, u, right, way]",109,1
9,robertmanoin,you misspelled RED omg,0,0.437904,"[misspelled, red, omg]",19,0


In [5]:
X = data[['tweet', 'tweet_len', 'rad_key']]
y = data['label']

In [6]:
#Using Tfidf Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=cleanAndTokenize)
tfidf_vect_fit = tfidf_vect.fit(X['tweet'])

tfidf_train = tfidf_vect_fit.transform(X['tweet'])
#tfidf_test = tfidf_vect_fit.transform(X_test['tweet'])

X_train_vect = pd.concat([X[['tweet_len', 'rad_key']].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray())], axis=1)
#X_test_vect = pd.concat([X_test[['tweet_len', 'rad_key']].reset_index(drop=True), 
           #pd.DataFrame(tfidf_test.toarray())], axis=1)

X_train_vect.tail(30)

Unnamed: 0,tweet_len,rad_key,0,1,2,3,4,5,6,7,...,14026,14027,14028,14029,14030,14031,14032,14033,14034,14035
5648,11,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5649,26,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5650,42,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5651,57,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5652,111,1,0.0,0.214049,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5653,86,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5654,134,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5655,121,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5656,89,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5657,52,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

In [9]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
#from sklearn.tree import DecisionTreeClassifier

#dt = DecisionTreeClassifier(max_depth=5, random_state=0)
sv = SVC()
param = {
    'kernel': ['linear','rbf'],
    'C': [1,100]
}

gs = GridSearchCV(sv, param, cv=5, n_jobs=-1)
cv_fit = gs.fit(X_train_vect,y)
#pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]
print(cv_fit.best_params_)

{'C': 100, 'kernel': 'linear'}


In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
#from sklearn.tree import DecisionTreeClassifier

#dt = DecisionTreeClassifier(max_depth=5, random_state=0)
rf = RandomForestClassifier()
param = {
    'n_estimators': [10,50,100,200],
    'max_depth': [None,10,20,30]
}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
cv_fit = gs.fit(X_train_vect,y)
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,38.610945,1.963792,0.424268,0.085037,,100,"{'max_depth': None, 'n_estimators': 100}",0.943662,0.944542,0.93662,0.940969,0.937445,0.940648,0.003189,1
3,71.644373,1.071569,0.445915,0.051999,,200,"{'max_depth': None, 'n_estimators': 200}",0.93838,0.941021,0.940141,0.940969,0.940969,0.940296,0.001013,2
1,23.210016,0.452095,0.340888,0.045012,,50,"{'max_depth': None, 'n_estimators': 50}",0.9375,0.941021,0.941901,0.940969,0.937445,0.939768,0.001903,3
0,8.617194,0.398894,0.262897,0.028806,,10,"{'max_depth': None, 'n_estimators': 10}",0.929577,0.926056,0.933979,0.93304,0.915419,0.927615,0.006706,4
12,2.975496,0.44847,0.199075,0.006026,30.0,10,"{'max_depth': 30, 'n_estimators': 10}",0.895246,0.888204,0.892606,0.888987,0.869604,0.886932,0.009026,5


In [15]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=5, random_state=0)
rf = AdaBoostClassifier(base_estimator=dt)
param = {
    'n_estimators': [50,100,200],
    'learning_rate': [0.1,0.5,0.05]
}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
cv_fit = gs.fit(X_train_vect,y)
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,300.254349,4.75744,11.577173,0.351197,0.5,100,"{'learning_rate': 0.5, 'n_estimators': 100}",0.961268,0.950704,0.951585,0.962115,0.955947,0.956323,0.004736,1
5,596.51745,21.358381,18.816186,0.202508,0.5,200,"{'learning_rate': 0.5, 'n_estimators': 200}",0.958627,0.958627,0.954225,0.951542,0.957709,0.956147,0.002815,2
3,151.134877,1.471691,6.183921,0.450688,0.5,50,"{'learning_rate': 0.5, 'n_estimators': 50}",0.963028,0.957746,0.952465,0.947137,0.956828,0.955442,0.005341,3
6,140.445979,1.979525,5.511299,0.120386,0.05,50,"{'learning_rate': 0.05, 'n_estimators': 50}",0.957746,0.953345,0.950704,0.955066,0.959471,0.955266,0.003111,4
0,138.494406,0.729756,6.333343,0.086927,0.1,50,"{'learning_rate': 0.1, 'n_estimators': 50}",0.961268,0.953345,0.950704,0.950661,0.956828,0.954561,0.004042,5


In [10]:
#Grid Search KNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

kn = KNeighborsClassifier()
param = {
    'n_neighbors': [3,5,11,19],
    'weights': ['uniform','distance'],
    'metric':['euclidean','manhattan']
}

gs = GridSearchCV(kn, param, cv=5, n_jobs=-1)
cv_fit = gs.fit(X_train_vect,y)
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,11.454158,1.269725,10.475139,0.636509,euclidean,11,distance,"{'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'distance'}",0.889085,0.889965,0.894366,0.896035,0.888106,0.891511,0.003114,1
13,12.331219,1.587305,14.496003,0.551636,manhattan,11,distance,"{'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'distance'}",0.888204,0.883803,0.897887,0.89163,0.888987,0.890102,0.004637,2
15,10.977269,1.049059,10.192719,0.973942,manhattan,19,distance,"{'metric': 'manhattan', 'n_neighbors': 19, 'weights': 'distance'}",0.889085,0.881162,0.897007,0.893392,0.887225,0.889574,0.005411,3
3,13.339245,0.780353,10.482745,0.746322,euclidean,5,distance,"{'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}",0.883803,0.888204,0.896127,0.889868,0.888106,0.889222,0.003994,4
4,14.376064,2.023098,11.384993,0.711847,euclidean,11,uniform,"{'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'uniform'}",0.883803,0.885563,0.893486,0.893392,0.888106,0.888869,0.003974,5
