# # # # LIBRARIES NEEDED FOR THE PROGRAM # # # #

In [None]:
#General libraries, twitter and mongodb libraries
import tweepy as tpy
import pandas as pd
import numpy as np
import json
import pymongo

#Text preprocessing libraries
import preprocessor as p
from nltk.corpus import stopwords
from textblob import TextBlob
from nltk.stem import PorterStemmer
from textblob import Word

#Vectorizing and machine learning algorithm libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import sklearn.feature_extraction.text as text
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, svm
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import Pipeline

#Library for Sentiment Analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#Library for visualization
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict

#Library for WordCloud
from wordcloud import WordCloud, STOPWORDS

#Libraries for Analytics
from numpy.random import seed, randn
from scipy.stats import ttest_ind, stats, f_oneway, chi2_contingency

# # # # Gathering Data From Twitter API & Converting to DataFrame # # # #

In [None]:
#Creating function to generate twitter API
def create_api():
    """Prompt for the four Twitter credentials and build a tweepy API client.

    The values are read from standard input in this order: consumer key,
    consumer secret, access token, access secret.

    Returns:
        The ``tpy.API`` object created from the OAuth handler.
    """
    # Each credential is asked for once, in a fixed order.
    prompts = (
        ("consumer_key", "Enter Consumer Key: "),
        ("consumer_secret", "Enter Consumer Secret: "),
        ("access_token", "Enter Access Token: "),
        ("access_secret", "Enter Access Secret: "),
    )
    credentials = {name: input(message) for name, message in prompts}

    # The consumer pair goes into the handler; the access pair is attached to it.
    handler = tpy.OAuthHandler(credentials["consumer_key"],
                               credentials["consumer_secret"])
    handler.set_access_token(credentials["access_token"],
                             credentials["access_secret"])

    return tpy.API(handler)

In [None]:
# Build the tweepy client once (prompts for credentials); the extraction cell reuses it.
api = create_api()

In [None]:
#Function to extract twitter data
def twitter_extraction(api):
    """Search Twitter and store every returned tweet, tagged with a category, in MongoDB.

    Prompts on standard input for the MongoDB connection string, the search
    string, the maximum number of tweets and a category label. Each tweet is
    printed as JSON, inserted into ``raw_tweet_db.raw_tweet_collection`` and
    then given a ``category_name`` field.

    Args:
        api: tweepy API client, e.g. the object returned by ``create_api``.

    Returns:
        A ``(raw_tweet_collection, mongo_client_link)`` tuple: the collection
        handle and the connection string that was entered.

    Raises:
        ValueError: if the requested number of records is not an integer.
    """
    mongo_client_link = input("Please enter the connectivity link: ")

    client = pymongo.MongoClient(mongo_client_link)
    raw_tweet_collection = client['raw_tweet_db']['raw_tweet_collection']

    tweet_search = input("Please enter the tweet you want to retrieve: ")
    tweet_max = int(input("Please enter how many records you want to retrieve: "))
    category_name = input("Please enter category name: ")

    # NOTE(review): ``api.search`` is the tweepy 3.x name; tweepy 4 exposes it
    # as ``search_tweets`` -- confirm the installed version.
    cursor = tpy.Cursor(api.search, q=tweet_search, lang='en',
                        exclude='retweets', tweet_mode='extended')

    for tweet in cursor.items(tweet_max):
        raw_tweets = dict(tweet._json)

        # Read the id by key. The previous positional lookup
        # (``list(raw_tweets.values())[1]``) only worked while 'id' happened
        # to be the second field of the payload.
        tweet_id = raw_tweets['id']

        # Print before inserting: insert_one adds an ObjectId ``_id`` to the
        # dict, which json.dumps cannot serialise.
        print(json.dumps(raw_tweets, indent=3))

        raw_tweet_collection.insert_one(raw_tweets)
        # Tag every stored copy of this tweet with the category.
        raw_tweet_collection.update_many({'id': tweet_id},
                                         {"$set": {'category_name': category_name}})

    return raw_tweet_collection, mongo_client_link

In [None]:
# rtc: handle of the raw-tweet collection; mcl: the MongoDB connection string the user typed.
rtc, mcl = twitter_extraction(api)

In [None]:
def data_selection(mcl, raw_tweet_collection):
    """Copy a fixed subset of fields from the raw tweets into a second collection.

    Every document of ``raw_tweet_collection`` is read with a projection that
    keeps only the listed fields (and drops ``_id``), printed as JSON, and
    inserted into ``selected_tweet_columns_db.selected_tweet_columns_collection``.

    Args:
        mcl: MongoDB connection string.
        raw_tweet_collection: collection holding the raw tweet documents.

    Returns:
        A ``(database, collection)`` tuple for the destination.
    """
    # NOTE(review): 'retweeted_status.user_mentions.screen_name' has no
    # 'entities' segment, unlike the top-level mention fields -- confirm the
    # path actually matches the stored documents.
    kept_fields = (
        'created_at', 'id', 'full_text',
        'entities.hashtags.text', 'entities.user_mentions.screen_name',
        'entities.user_mentions.id', 'user.id', 'user.name', 'user.screen_name',
        'user.location', 'user.protected', 'user.followers_count', 'user.friends_count',
        'user.listed_count', 'user.created_at', 'user.favourites_count', 'user.statuses_count',
        'retweeted_status.created_at', 'retweeted_status.id', 'retweeted_status.full_text',
        'retweeted_status.user_mentions.screen_name',
        'retweet_count', 'favorite_count',
        'possibly_sensitive', 'lang', 'category_name',
    )
    projection = {'_id': 0}
    projection.update((field, 1) for field in kept_fields)

    destination_db = pymongo.MongoClient(mcl)['selected_tweet_columns_db']
    destination = destination_db['selected_tweet_columns_collection']

    for document in raw_tweet_collection.find({}, projection):
        # Show the trimmed document, then store it.
        print(json.dumps(document, indent=3))
        destination.insert_one(document)

    return destination_db, destination

In [None]:
# stcd / stcc: database and collection handles that now hold the trimmed tweet documents.
stcd, stcc = data_selection(mcl, rtc)

In [None]:
#Function to load the data to pandas dataframe 
def load_data(selected_tweet_columns_db, selected_tweet_columns_collection, mongo_client_link):
    """Load the selected tweets from MongoDB into pandas and save them as CSV.

    Prompts on standard input for the output directory and file name.

    Args:
        selected_tweet_columns_db: unused; the database is reopened by name
            through ``mongo_client_link``. Kept for caller compatibility.
        selected_tweet_columns_collection: unused, kept for the same reason.
        mongo_client_link: MongoDB connection string.

    Returns:
        A ``(rd_tweets_df, new_tweet_df)`` tuple. The first frame holds every
        selected column with duplicate ``full_text`` rows removed; the second
        holds only the text (renamed ``content``) and ``category_name``.
    """
    import os  # local import: only needed to build the output path

    client = pymongo.MongoClient(mongo_client_link)
    collection = client['selected_tweet_columns_db']['selected_tweet_columns_collection']

    # Nested documents are flattened two levels deep ("user.id", ...).
    tweets_df = pd.json_normalize(collection.find({}, {'_id': 0}), max_level=2)

    # Keep the first occurrence of each tweet text. The explicit copy makes the
    # result an independent frame, since callers add columns to it later.
    rd_tweets_df = tweets_df.drop_duplicates(subset=['full_text']).copy()

    # Text-only view used for preprocessing / classification.
    new_tweet_df = rd_tweets_df[['full_text', 'category_name']].rename(
        columns={'full_text': 'content'})

    location = input("Please enter the path where you want to save the file: ")
    fname = input("Please enter the file name: ")

    # os.path.join inserts the separator when the directory was typed without a
    # trailing slash; plain concatenation glued the two parts together.
    rd_tweets_df.to_csv(os.path.join(location, fname))

    return rd_tweets_df, new_tweet_df

In [None]:
# original_tweet_df: all selected columns; modified_tweet_df: only 'content' and 'category_name'.
# NOTE(review): stcd and stcc are passed but load_data reconnects through mcl and never reads them.
original_tweet_df, modified_tweet_df = load_data(stcd, stcc, mcl)

In [None]:
# Notebook display of the full frame.
original_tweet_df

In [None]:
# Notebook display of the text/category frame.
modified_tweet_df

# # # # Training Data Import from MongoDB # # # # 

In [None]:
def load_trainingData(mongo_client_link):
    """Load the labelled troll data set from MongoDB into a DataFrame.

    Reads ``troll_classification.collection_troll`` (without ``_id``,
    ``extras`` and ``annotation.notes``), flattens it, and replaces the
    ``annotation.label`` column with a scalar ``Troll_label`` column.

    Args:
        mongo_client_link: MongoDB connection string.

    Returns:
        The training DataFrame with a ``Troll_label`` column.
    """
    client = pymongo.MongoClient(mongo_client_link)
    tcollec = client['troll_classification']['collection_troll']

    # Show which collection is being read.
    print(tcollec)

    training_df = pd.json_normalize(
        tcollec.find({}, {'_id': 0, 'extras': 0, 'annotation.notes': 0}))

    def _first_label(labels):
        """Return the first label of one row, or None when there is none."""
        if isinstance(labels, str):
            return labels
        try:
            return next(iter(labels), None)
        except TypeError:
            # Not iterable (e.g. a missing annotation).
            return None

    # One value per row. Flattening all label lists into a single list broke
    # the column length whenever a row had zero or several labels.
    training_df['Troll_label'] = [
        _first_label(labels) for labels in training_df['annotation.label']
    ]

    return training_df.drop(labels='annotation.label', axis=1)

In [None]:
# Labelled training data, read from the same MongoDB deployment as the scraped tweets.
training_df = load_trainingData(mcl)

In [None]:
# Notebook display of the training frame.
training_df

# # # # PreProcessing of Text Data (Tweets) - NLP PreProcessing # # # #

In [None]:
# Lookup table used to expand contractions, chat abbreviations and censored
# words into plain words. Keys are matched against whole whitespace-separated
# tokens. The "&;" variants cover apostrophes that arrive as a mangled entity.
# Corrections relative to the earlier table: "np" -> "no problem",
# "gonna" -> "going to", "rofl" added next to the misspelled "rolf" (kept so
# existing matches still work), duplicate keys removed.
abbr_dict={"dunno": "do not know", "wanna": "want to", "what's":"what is", "what're":"what are","who's":"who is","who're":"who are","where's":"where is",
    "where're":"where are","when's":"when is","when're":"when are","how's":"how is","how're":"how are",
    "i'm":"i am","we're":"we are","you're":"you are","they're":"they are","it's":"it is","he's":"he is",
    "she's":"she is","that's":"that is","there's":"there is","there're":"there are","i've":"i have","we've":"we have",
    "you've":"you have","they've":"they have","who've":"who have","would've":"would have","not've":"not have",
    "i'll":"i will","we'll":"we will","you'll":"you will","he'll":"he will","she'll":"she will",
    "it'll":"it will","they'll":"they will","I'll":"i will","isn't":"is not","wasn't":"was not","aren't":"are not","weren't":"were not",
    "can't":"can not","couldn't":"could not","don't":"do not","didn't":"did not","shouldn't":"should not",
    "wouldn't":"would not","doesn't":"does not","haven't":"have not","hasn't":"has not","hadn't":"had not",
    "won't":"will not","u":"you","ur":"your", "rolf": "rolling on floor laughing", "rofl": "rolling on floor laughing",
    "stfu": "shut the fuck up",
    "icymi": "in case you missed it", "tl;dr": "too long, didn’t read", "lmk": "let me know","nvm": "nevermind",
    "tgif": "thank goodness it’s Friday", "tbh": "to be honest", "tbf": "to be frank", "rn": "right now",
    "qotd": "quote of the day", "brb": "be right back", "btw": "by the way", "lol": "laugh out loud",
    "ttyl": "talk to you later", "hmu": "hit me up", "fwiw": "for what it’s worth",
    "imo": "in my opinion", "imho": "in my humble opinion", "idk": "i do not know", "tba": "to be announced",
    "tbd": "to be decided", "faq": "frequently asked question", "asap": "as soon as possible",
    "aka": "also known as", "diy": "do it yourself", "np": "no problem", "ty": "thank you", "hifw": "how i feel when",
    "bts": "behind the scenes", "cmv": "change my view", "dyk": "did you know", "eli5" : "explain it to me like i am five",
    "ftw": "for the win", "irl": "in real life", "nbd": "no big deal", "oc": "original content", "tftf": "thanks for the follow",
    "tfw": "that feeling when", "tigf": "thank god it is friday", "f*ck": "fuck", "f***k": "fuck", "s**k": "suck",
    "b***h": "bitch", "b**ch": "bitch", "a**": "ass", "a**h*le": "asshole", "fu*k": "fuck", "sh*t": "shit", "s**t": "shit",
    "omg": "oh my god", "ily": "i love you", "lmao": "laughing my ass off", "wtf": "what the fuck", "ppl": "people",
    "thx": "thanks", "ffs": "for fuck's sake", "fml": "fuck my life", "jj": "just joking",
    "jk": "just kidding", "bff": "best friend forever", "txt": "text", "hbd": "happy birthday",
    "gtfo": "get the fuck out", "dgaf": "do not give a fuck", "dtf": "down to fuck", "smfh": "shaking my fucking head",
    "roflmao": "rolling on floor laughing my ass off", "ptfo": "passed the fuck out", "ttys": "talk to you soon",
    "fbo": "facebook official", "ttyn": "talk to you never", "b4": "before", "bae": "before anyone else", "btaim": "be that as it may",
    "cx": "customer experience", "dm": "direct message", "f2f": "face to face", "b2b": "business to business",
    "b2c": "business to customer", "fb": "facebook", "ftfy": "fixed that for you", "g2g": "got to go", "gr8": "great",
    "hmb": "hit me back", "hth": "happy to help", "ianad": "i am not a doctor", "ianal": "i am not a lawyer",
    "idc": "i do not care", "ig": "instagram", "rss": "really simple syndication", "rt": "retweet", "motherf**ker": "motherfucker",
    "motherfu*cker": "motherfucker", "'em": "them", "ik": "i know", "what&;s":"what is", "what&;re":"what are",
    "who&;s":"who is","who&;re":"who are","where&;s":"where is",
    "where&;re":"where are","when&;s":"when is","when&;re":"when are","how&;s":"how is","how&;re":"how are",
    "i&;m":"i am","we&;re":"we are","you&;re":"you are","they&;re":"they are","it&;s":"it is","he&;s":"he is",
    "she&;s":"she is","that&;s":"that is","there&;s":"there is","there&;re":"there are","i&;ve":"i have","we&;ve":"we have",
    "you&;ve":"you have","they&;ve":"they have","who&;ve":"who have","would&;ve":"would have","not&;ve":"not have",
    "i&;ll":"i will","we&;ll":"we will","you&;ll":"you will","he&;ll":"he will","she&;ll":"she will",
    "it&;ll":"it will","they&;ll":"they will","I&;ll":"i will","isn&;t":"is not","wasn&;t":"was not",
    "aren&;t":"are not","weren&;t":"were not",
    "can&;t":"can not","couldn&;t":"could not","don&;t":"do not","didn&;t":"did not","shouldn&;t":"should not",
    "wouldn&;t":"would not","doesn&;t":"does not","haven&;t":"have not","hasn&;t":"has not","hadn&;t":"had not",
    "won&;t":"will not", "gonna": "going to", "gotcha": "i have got you", "d": "the", "n": "and", "amp": "and"}

In [None]:
#function to preprocess the text data
def nlp_preprocessing(dataframe):
    """Clean the ``content`` column of *dataframe* in place and return the frame.

    For every row, in order:
      1. each token is passed through ``p.clean`` with URL, mention, hashtag,
         reserved-word, emoji, smiley and number removal enabled;
      2. the text is lower-cased;
      3. tokens found in ``abbr_dict`` are replaced by their expansion;
      4. tokens that are not purely alphabetic are dropped (the whole token,
         not just the offending characters);
      5. English stop-words are dropped;
      6. the remaining tokens are lemmatized with ``textblob.Word``.

    Args:
        dataframe: DataFrame with a string ``content`` column.

    Returns:
        The same DataFrame object, with ``content`` overwritten.
    """
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG, p.OPT.RESERVED,
                  p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER)

    # A set gives constant-time membership tests; the stop-word list was being
    # scanned linearly for every token.
    stop = set(stopwords.words('english'))

    def _clean_text(raw):
        """Run the six cleaning steps on one tweet and return the cleaned text."""
        tokens = " ".join(p.clean(token) for token in raw.split()).lower().split()
        # Expansions can be several words long, so re-split after substituting.
        expanded = " ".join(abbr_dict.get(token, token) for token in tokens).split()
        kept = [token for token in expanded
                if token.isalpha() and token not in stop]
        return " ".join(Word(token).lemmatize() for token in kept)

    # One pass per row instead of six separate ``apply`` sweeps.
    dataframe['content'] = dataframe['content'].apply(_clean_text)

    return dataframe

In [None]:
# nlp_preprocessing overwrites 'content' on the frame it is given and returns that
# same frame, so training_df and clean_training_df refer to the cleaned data.
clean_training_df = nlp_preprocessing(training_df)

In [None]:
# Notebook display of the cleaned training data.
clean_training_df

# # # # Splitting Data for Training and Testing and Vectorization # # # #

In [None]:
def train_test_split(dataframe, test_size=None, random_state=None):
    """Shuffle *dataframe* and split it into training and validation parts.

    This function has the same name as scikit-learn's ``train_test_split``
    imported at the top of the file and therefore replaces it in this
    namespace; the scikit-learn implementation is reached through the
    ``model_selection`` module instead.

    Args:
        dataframe: DataFrame with ``content`` (text) and ``Troll_label`` columns.
        test_size: forwarded to scikit-learn; ``None`` keeps its default split.
        random_state: optional seed applied to both the shuffle and the split
            so a run can be reproduced. ``None`` keeps them random.

    Returns:
        ``(train_x, valid_x, train_y, valid_y)``.
    """
    # Randomise the row order before splitting.
    shuffled = dataframe.sample(frac=1, random_state=random_state)

    train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
        shuffled['content'],
        shuffled['Troll_label'],
        test_size=test_size,
        random_state=random_state,
    )

    return train_x, valid_x, train_y, valid_y

In [None]:
# Calls the notebook's own train_test_split defined in the previous cell (it shadows the sklearn import).
train_x, valid_x, train_y, valid_y = train_test_split(clean_training_df)

In [None]:
def data_vectorizing(dataframe, train_x, valid_x, *, return_vectorizer=False):
    """Fit a TF-IDF vectorizer and transform the training and validation text.

    Args:
        dataframe: DataFrame whose ``content`` column the vectorizer is fitted on.
        train_x: training texts to transform.
        valid_x: validation texts to transform.
        return_vectorizer: when true, the fitted vectorizer is returned as a
            third element so other text can later be projected into the same
            feature space. Defaults to ``False``, which keeps the two-element
            return value.

    Returns:
        ``(xtrain_tfidf, xvalid_tfidf)``, or
        ``(xtrain_tfidf, xvalid_tfidf, tfidf_vect)`` when
        ``return_vectorizer`` is true.
    """
    # NOTE(review): the vocabulary/IDF weights are learnt from the whole frame,
    # validation rows included -- confirm this is intended.
    tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=10000)
    tfidf_vect.fit(dataframe['content'])

    xtrain_tfidf = tfidf_vect.transform(train_x)
    xvalid_tfidf = tfidf_vect.transform(valid_x)

    if return_vectorizer:
        return xtrain_tfidf, xvalid_tfidf, tfidf_vect
    return xtrain_tfidf, xvalid_tfidf

In [None]:
# TF-IDF matrices for the training and validation texts.
# NOTE(review): the vectorizer fitted inside this call is not kept here, so the
# vocabulary used for these matrices is unavailable to later cells.
xtrain_tfidf, xvalid_tfidf = data_vectorizing(clean_training_df, train_x, valid_x)

In [None]:
def newdata_vectorizing(dataframe, vectorizer=None):
    """Turn the ``content`` column of *dataframe* into a TF-IDF matrix.

    Args:
        dataframe: DataFrame with a ``content`` text column.
        vectorizer: optional, already-fitted vectorizer exposing
            ``transform``. Pass the one fitted on the training data so the
            columns of the result mean the same thing as the features the
            classifier was trained on. When omitted, a new ``TfidfVectorizer``
            is fitted on *dataframe* itself, whose vocabulary is unrelated to
            any earlier fit.

    Returns:
        The matrix returned by the vectorizer's ``transform``.
    """
    if vectorizer is not None:
        # Reuse the supplied vocabulary; do not refit on the new text.
        return vectorizer.transform(dataframe['content'])

    tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=10000)
    tfidf_vect.fit(dataframe['content'])
    tweet_tfidf = tfidf_vect.transform(dataframe['content'])

    return tweet_tfidf

# # # # Model Training and Testing # # # # 

In [None]:
def custom_model_train_test(xtrain, ytrain, xtest, ytest):
    """Train a hard-voting ensemble of five classifiers and report test metrics.

    The members are multinomial naive Bayes, a decision tree, a random forest,
    an SVC and a passive-aggressive classifier, each wrapped in a one-step
    ``Pipeline``. Predictions, accuracy (as a percentage) and the confusion
    matrix for the test split are printed.

    Args:
        xtrain: training feature matrix.
        ytrain: training labels.
        xtest: test feature matrix.
        ytest: test labels.

    Returns:
        ``(ensemble, ensemble_fit)`` -- the ``VotingClassifier`` and the value
        returned by its ``fit`` call.
    """
    # Member name -> estimator, in voting order.
    members = {
        'multinomial': naive_bayes.MultinomialNB(alpha=0.2),
        'decisiontree': DecisionTreeClassifier(random_state = 2),
        'randomforest': RandomForestClassifier(),
        'svc': svm.SVC(),
        'passiveaggressive': linear_model.PassiveAggressiveClassifier(C = 0.5, random_state = 5),
    }
    voters = [(label, Pipeline([('m', estimator)]))
              for label, estimator in members.items()]

    ensemble = VotingClassifier(estimators = voters, voting = 'hard')
    ensemble_fit = ensemble.fit(xtrain, ytrain)

    predictions = ensemble.predict(xtest)

    print("Prediction: ", predictions)
    print("Model Accuracy: ", accuracy_score(ytest, predictions)*100)
    print("Confusion Matrix: \n", confusion_matrix(ytest, predictions))

    return ensemble, ensemble_fit

In [None]:
# Train on the TF-IDF training matrix and print metrics for the validation matrix.
ensemble, ensemble_fit = custom_model_train_test(xtrain_tfidf, train_y, xvalid_tfidf, valid_y)

In [None]:
#Testing new dataset extracted from twitter

def new_data_test(ensemble, ensemble_fit, xtest):
    """Predict labels for *xtest* with an already-trained ensemble.

    Args:
        ensemble: object exposing ``predict``.
        ensemble_fit: accepted for call compatibility; it is not used for the
            prediction.
        xtest: feature matrix to classify.

    Returns:
        Whatever ``ensemble.predict(xtest)`` returns.
    """
    return ensemble.predict(xtest)

# # # # Sentiment Analysis # # # #

In [None]:
def sentiment_analysis(dataframe):
    """Score every text in ``dataframe['content']`` and label its sentiment.

    The compound score from ``SentimentIntensityAnalyzer.polarity_scores`` is
    mapped to a label: ``>= 0.05`` is 'Positive', ``<= -0.05`` is 'Negative',
    anything in between is 'Neutral'.

    Args:
        dataframe: DataFrame with a ``content`` text column.

    Returns:
        ``(sentiment_val, polarity_score)`` -- two lists, one label and one
        compound score per row, in row order.
    """
    scorer = SentimentIntensityAnalyzer()

    def _label_for(compound):
        """Map one compound score to its sentiment label."""
        if compound >= 0.05:
            return 'Positive'
        if compound <= - 0.05:
            return 'Negative'
        return 'Neutral'

    polarity_score = [scorer.polarity_scores(tweet)['compound']
                      for tweet in dataframe['content']]
    sentiment_val = [_label_for(score) for score in polarity_score]

    return sentiment_val, polarity_score

# # # # Working with Tweets # # # #

In [None]:
# Clean the scraped tweets with the same pipeline used for the training data.
# The function overwrites 'content' on modified_tweet_df itself.
modified_clean_tweet_df = nlp_preprocessing(modified_tweet_df)

In [None]:
# NOTE(review): called like this, newdata_vectorizing fits a brand-new TF-IDF
# vocabulary on these tweets, so the columns of tweet_tfidf do not correspond
# to the features the ensemble was trained on.
tweet_tfidf = newdata_vectorizing(modified_clean_tweet_df)

In [None]:
# Classify the scraped tweets with the trained ensemble.
predictions = new_data_test(ensemble, ensemble_fit, tweet_tfidf)

In [None]:
# Notebook display of the predicted labels.
predictions

In [None]:
# Sentiment label and compound score for every cleaned tweet.
sentiment_val, sentiment_polarity = sentiment_analysis(modified_clean_tweet_df)
print("Sentiment Value: ", sentiment_val)

In [None]:
print("\nSentiment Polarity: ", sentiment_polarity)

In [None]:
# Attach the results to the full frame. This relies on original_tweet_df and
# modified_tweet_df having the same rows in the same order (both come from the
# same de-duplicated frame in load_data).
original_tweet_df['Troll_label'] = predictions
original_tweet_df['Text_Sentiment'] = sentiment_val
original_tweet_df['Sentiment_Polarity'] = sentiment_polarity

In [None]:
# original_tweet_df.to_csv("Tweets_classified.csv")
# Notebook display of the column names now present on the frame.
list(original_tweet_df.columns)

In [None]:
def top_not_stopwords_barplot(dataframe):
    """Draw a horizontal bar plot of the ten most frequent non-stop-words.

    Args:
        dataframe: DataFrame with a ``content`` text column; words are the
            whitespace-separated tokens of that column.

    Returns:
        None. The plot is drawn on a new 8x8 matplotlib figure.
    """
    stop = set(stopwords.words('english'))

    frequencies = Counter(
        word for tokens in dataframe['content'].str.split() for word in tokens
    )

    # Filter first, then keep ten. Truncating to the ten most common words
    # before dropping stop-words could leave fewer than ten bars.
    top = [(word, count) for word, count in frequencies.most_common()
           if word not in stop][:10]

    words = [word for word, _ in top]
    counts = [count for _, count in top]

    plt.figure(figsize=(8,8))

    sns.barplot(x=counts,y=words)

In [None]:
# Most frequent remaining words in the cleaned tweets.
top_not_stopwords_barplot(modified_clean_tweet_df)

In [None]:
def word_number_histogram(dataframe):
    """Plot a histogram of the number of whitespace-separated words per row.

    Args:
        dataframe: DataFrame with a ``content`` text column.

    Returns:
        None. The histogram is drawn with an 8x8 figure size.
    """
    words_per_row = dataframe['content'].str.split().map(len)
    words_per_row.hist(figsize = (8,8))

In [None]:
# Distribution of tweet length (in words) after cleaning.
word_number_histogram(modified_clean_tweet_df)
# Observation recorded for the data this notebook was run on:
# the number of words per tweet ranges from 0 to 35
# and mostly falls between 0 and 11 words, as the chart below shows.

In [None]:
def top_stopwords_barchart(dataframe):
    """Draw a bar chart of the ten most frequent English stop-words.

    Matching is case-sensitive: a token counts only if it equals a stop-word
    exactly as listed by NLTK.

    Args:
        dataframe: DataFrame with a ``full_text`` column holding the raw tweets.

    Returns:
        None. The chart is shown with ``plt.show()``; if no stop-word occurs
        at all, a message is printed and nothing is drawn.
    """
    stopword = set(stopwords.words('english'))

    tally = Counter(
        word
        for tokens in dataframe['full_text'].str.split()
        for word in tokens
        if word in stopword
    )
    top = tally.most_common(10)

    # zip(*[]) cannot be unpacked into two names, so stop here when empty.
    if not top:
        print("No stopwords found in the supplied tweets; nothing to plot.")
        return

    x, y = zip(*top)

    plt.subplots(figsize = (8,8))

    plt.bar(x,y)

    plt.show()

In [None]:
# Uses the raw 'full_text' column, because stop-words are removed from the cleaned text.
top_stopwords_barchart(original_tweet_df)

# Observation recorded for the data this notebook was run on:
# the 10 most frequent stopwords in the tweets are
# "the, to, and, of, a, in, is, for, on and with", as the bar plot below shows.

In [None]:
# Build a small frame listing every category and the number of tweets in it.

# value_counts() returns one count per category, labelled by the category and
# ordered by frequency.
trct = original_tweet_df['category_name'].value_counts()

# Distinct categories in order of first appearance (used by later cells).
category_name = original_tweet_df['category_name'].unique()

records = trct.tolist()

# Take the labels from the counts themselves. Pairing the counts with
# unique() attached them to the wrong categories, because the two results are
# ordered differently.
data = {'category': trct.index.tolist(), 'Total_Tweets': records}

trct_df = pd.DataFrame(data)

print("Total number of records in each category of tweets:\n\n", trct_df)

In [None]:
# Pie chart of the share of tweets per category, taken from trct_df.

fig1, ax1 = plt.subplots(figsize= (8,8))
# Slice sizes come from 'Total_Tweets', slice labels from 'category'.
ax1.pie(trct_df['Total_Tweets'], labels = trct_df['category'], autopct = '%1.1f%%', shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()

In [None]:
#Troll and Not Troll tweet counts for each category

newDf = pd.DataFrame(original_tweet_df, columns=['category_name', 'Troll_label'])

a = newDf.groupby(['category_name', 'Troll_label'])[['Troll_label']].count()

# One row per category, one column per label (0 = not troll, 1 = troll).
# Missing combinations become 0, so a category with only one kind of tweet no
# longer shifts every later count into the wrong column. Labels are cast to
# int so both '0'/'1' strings and 0/1 numbers are accepted.
label_counts = (
    newDf.assign(Troll_label=newDf['Troll_label'].astype(int))
    .groupby(['category_name', 'Troll_label'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=[0, 1], fill_value=0)
)

not_troll = label_counts[0].tolist()
troll = label_counts[1].tolist()

# Category labels are taken from the grouped result so that each name sits
# next to its own counts.
info = {'category' : label_counts.index.tolist(), 'Not Troll': not_troll, 'Troll' : troll}

tntcc = pd.DataFrame(info)

In [None]:
# Notebook display of the per-category troll / not-troll counts.
tntcc

In [None]:
# Grouped bar chart: one pair of bars (not troll, troll) per category.
x = np.arange(len(tntcc['category']))

width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize = (8,8))
# The two series are shifted half a bar width left/right of each tick.
rects1 = ax.bar(x - width/2, tntcc['Not Troll'], width, label='Not Troll')
rects2 = ax.bar(x + width/2, tntcc['Troll'], width, label='Troll')

# Add some text for labels, title and custom x-axis tick labels, etc.
# NOTE(review): the bars are tweet counts, although the axis is labelled 'Scores'.
ax.set_ylabel('Scores')
ax.set_title('Troll and Not Troll by Category')
plt.xticks(x, rotation=90)
ax.set_xticklabels(tntcc['category'])
ax.legend(fontsize = 10)

def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 10),  # 10 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')


autolabel(rects1)
autolabel(rects2)

fig.tight_layout()

plt.show()

In [None]:
def word_cloud(dataframe):
    """Show a word cloud of the tweets belonging to one category.

    Prompts on standard input for the category name, cleans the matching
    ``full_text`` values and renders the result as an 800x800 word cloud.

    Cleaning follows the same order as ``nlp_preprocessing``: tokens are
    cleaned with ``p.clean``, lower-cased, expanded through ``abbr_dict``,
    and only then filtered to purely alphabetic tokens and lemmatized.
    Filtering before the expansion discarded every contraction, and matching
    before lower-casing missed capitalised abbreviations.

    Args:
        dataframe: DataFrame with ``category_name`` and ``full_text`` columns.

    Returns:
        None. If the category has no usable words a message is printed and
        nothing is drawn.
    """
    cat_name = input("Please enter category name: ")

    text = dataframe.loc[dataframe['category_name'] == cat_name, 'full_text']

    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG, p.OPT.RESERVED, p.OPT.EMOJI,
                  p.OPT.SMILEY, p.OPT.NUMBER)

    def _prepare(tweet):
        """Clean one tweet and return its lower-case, lemmatized words."""
        tokens = " ".join(p.clean(token) for token in tweet.split()).lower().split()
        expanded = " ".join(abbr_dict.get(token, token) for token in tokens).split()
        return " ".join(Word(token).lemmatize()
                        for token in expanded if token.isalpha())

    # Build the corpus in one join instead of growing a string in a loop.
    comment_words = " ".join(_prepare(tweet) for tweet in text.dropna())

    # WordCloud.generate needs at least one word to draw.
    if not comment_words.strip():
        print("No usable words found for category {!r}; nothing to plot.".format(cat_name))
        return

    wordcloud = WordCloud(width = 800, height = 800,
                    background_color ='white',
                    min_font_size = 10).generate(comment_words)

    # plot the WordCloud image
    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)

    plt.show()

In [None]:
# Prompts for a category name, then draws the word cloud from the raw tweet text.
word_cloud(original_tweet_df)

In [None]:
#Percentage calculation of Troll and Not Troll tweets
# Each percentage is that label's share of the category's total (Troll + Not Troll), rounded to 2 decimals.
tntcc['Troll_percentage'] = ((tntcc['Troll'] / (tntcc['Troll'] + tntcc['Not Troll']))*100).round(decimals = 2)
tntcc['Not_Troll_percentage'] = ((tntcc['Not Troll'] / (tntcc['Troll'] + tntcc['Not Troll']))*100).round(decimals = 2)
# Written to the current working directory.
tntcc.to_csv("troll_nottroll.csv")

In [None]:
#Analysis frame: category, troll label, engagement/account counters and sentiment
# Counter columns, in the order they appear in the frame.
count_columns = ['retweet_count', 'user.followers_count',
                 'user.favourites_count', 'user.friends_count',
                 'user.statuses_count', 'favorite_count']

newDf1 = pd.DataFrame(original_tweet_df,
                      columns=['category_name', 'Troll_label', *count_columns,
                               'Text_Sentiment', 'Sentiment_Polarity'])

# The label and every counter are cast to int, one column at a time.
for column in ['Troll_label', *count_columns]:
    newDf1[column] = newDf1[column].astype(int)

In [None]:
#Getting the statistical description of the dataset
# Summary statistics of newDf1, rounded to 2 decimals for display.
newDf1.describe().round(decimals = 2)

In [None]:
#Heatmap of the correlation between the numeric columns

plt.figure(figsize=(10,10))

# Correlation is only defined for numeric columns, so the text columns
# (category, sentiment label) are dropped explicitly instead of relying on
# corr() to skip them.
numeric_columns = newDf1.drop("Troll_label", axis=1).select_dtypes(include='number')

# Correlations lie in [-1, 1], so the colour map is centred on 0. The earlier
# center=2 was outside the vmin/vmax range.
sns.heatmap(numeric_columns.corr(), annot = True, vmin=-1, vmax=1, center= 0, cmap= 'rainbow', linewidths=2, linecolor='black')

In [None]:
#Checking Troll and Not Troll on the basis of Sentiment

# Compare labels as text so the selection works whether the predictions are
# stored as '0'/'1' strings or as 0/1 integers.
troll_labels = original_tweet_df['Troll_label'].astype(str)
ntts = original_tweet_df.loc[troll_labels == '0', 'Text_Sentiment']
tts = original_tweet_df.loc[troll_labels == '1', 'Text_Sentiment']

Total_Not_Troll = tntcc['Not Troll'].sum()
Total_Troll = tntcc['Troll'].sum()

# Anything that is neither 'Positive' nor 'Negative' is counted as neutral.
NTroll_Pos = int((ntts == 'Positive').sum())
NTroll_Neg = int((ntts == 'Negative').sum())
NTroll_Neu = len(ntts) - NTroll_Pos - NTroll_Neg

Troll_Pos = int((tts == 'Positive').sum())
Troll_Neg = int((tts == 'Negative').sum())
Troll_Neu = len(tts) - Troll_Pos - Troll_Neg

# Contingency table: one row per sentiment, columns are (not troll, troll).
Pos = [NTroll_Pos, Troll_Pos]
Neg = [NTroll_Neg, Troll_Neg]
Neu = [NTroll_Neu, Troll_Neu]

List = [Pos, Neg, Neu]

# The p-value gets its own name: binding it to ``p`` would overwrite the
# ``preprocessor`` module alias imported at the top of the file and break
# every later call that cleans text.
stat, p_value, dof, expected = chi2_contingency(List)

print("Statistics: ", stat, "\np-Value: ", p_value, "\nDegree of Freedom: ", dof, "\nExpected Frequencies: \n", expected)

In [None]:
#Grouped bar chart of the sentiment counts above: one group per label (Non Troll, Troll)
x = np.arange(len(Pos))

width = 0.15  # the width of the bars

fig, ax = plt.subplots(figsize = (8,8))
# NOTE(review): the three bars sit at -width/2, +width/2 and +3*width/2, so each
# group is drawn slightly to the right of its tick rather than centred on it.
rects1 = ax.bar(x - width/2, Pos, width, label='Positive', color = '#32cd32') # #32cd32 - Lime Green
rects2 = ax.bar(x + width/2, Neu, width, label='Neutral', color = '#fff44f') # #fff44f - Lime Yellow
rects3 = ax.bar(x + width*3/2, Neg, width, label='Negative', color = '#FF0000') # #FF0000 - Red

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Count')
ax.set_title('Troll and Not Troll by sentiment')
plt.xticks(x)
ax.set_xticklabels(['Non Troll', 'Troll'])
ax.legend(fontsize = 10)

def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 10),  # 10 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')


autolabel(rects1)
autolabel(rects2)
autolabel(rects3)

fig.tight_layout()

plt.show()

In [None]:
#Objective: To test whether retweet count for troll and non troll tweets are equal

# seed the random number generator
# (kept from the original cell; nothing below draws random numbers)
seed(1)

# the two independent samples: retweet counts of non-troll and of troll tweets
data1 = newDf1.loc[newDf1['Troll_label'] == 0, 'retweet_count']
data2 = newDf1.loc[newDf1['Troll_label'] == 1, 'retweet_count']

# compare samples
# The p-value gets its own name: binding it to ``p`` would overwrite the
# ``preprocessor`` module alias imported at the top of the file.
stat, p_value = ttest_ind(data1, data2)

print('t = %.3f, p = %.3f' % (stat, p_value))

In [None]:
#Category wise retweet count
#Objective: one-way ANOVA on whether retweet counts differ between categories

# Category labels exactly as stored in 'category_name'.
retweet_categories = ('Political', 'Govt Institutions', 'Gaming', 'IT', 'Pharma',
                      'automobile', 'movie_celebrity', 'Movie_TVShows', 'Sports')

# One Series of retweet counts per category, in the order listed above.
(Political_RT, GI_RT, Gaming_RT, IT_RT, Pharma_RT,
 Automobile_RT, MC_RT, MTS_RT, Sports_RT) = (
    newDf1.loc[newDf1['category_name'] == category, 'retweet_count']
    for category in retweet_categories
)

f_oneway(Political_RT, GI_RT, Gaming_RT, IT_RT, Pharma_RT, Automobile_RT, MC_RT, MTS_RT, Sports_RT)

In [None]:
#Category wise followers count
#Objective: one-way ANOVA on whether followers counts differ between categories

# Category labels exactly as stored in 'category_name'.
follower_categories = ('Political', 'Govt Institutions', 'Gaming', 'IT', 'Pharma',
                       'automobile', 'movie_celebrity', 'Movie_TVShows', 'Sports')

# One Series of followers counts per category, in the order listed above.
(Political_RT1, GI_RT1, Gaming_RT1, IT_RT1, Pharma_RT1,
 Automobile_RT1, MC_RT1, MTS_RT1, Sports_RT1) = (
    newDf1.loc[newDf1['category_name'] == category, 'user.followers_count']
    for category in follower_categories
)

f_oneway(Political_RT1, GI_RT1, Gaming_RT1, IT_RT1, Pharma_RT1, Automobile_RT1, MC_RT1, MTS_RT1, Sports_RT1)

In [None]:
#Category wise favourites count
#Objective: one-way ANOVA on whether favourites counts differ between categories

# Category labels exactly as stored in 'category_name'.
favourite_categories = ('Political', 'Govt Institutions', 'Gaming', 'IT', 'Pharma',
                        'automobile', 'movie_celebrity', 'Movie_TVShows', 'Sports')

# One Series of favourites counts per category, in the order listed above.
(Political_RT1, GI_RT1, Gaming_RT1, IT_RT1, Pharma_RT1,
 Automobile_RT1, MC_RT1, MTS_RT1, Sports_RT1) = (
    newDf1.loc[newDf1['category_name'] == category, 'user.favourites_count']
    for category in favourite_categories
)

f_oneway(Political_RT1, GI_RT1, Gaming_RT1, IT_RT1, Pharma_RT1, Automobile_RT1, MC_RT1, MTS_RT1, Sports_RT1)

In [None]:
#Objective: Chi-Square test of association between trolling and tweet category

# First three columns of tntcc: category, 'Not Troll' count, 'Troll' count.
df_association = tntcc.iloc[:, :3]

# Contingency table: one row per label, one column per category.
Nt = df_association['Not Troll'].tolist()
T = df_association['Troll'].tolist()

NT_T_list = [Nt, T]

# The p-value gets its own name: binding it to ``p`` would overwrite the
# ``preprocessor`` module alias imported at the top of the file.
stat, p_value, dof, expected = chi2_contingency(NT_T_list)

print("Statistics: ", stat, "\np-Value: ", p_value, "\nDegree of Freedom: ", dof, "\nExpected Frequencies: \n", expected)