In [2]:
import os
import time

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.parsing.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.tree import DecisionTreeClassifier


In [3]:
# Location of the labelled tweet dataset (an absolute path on the author's machine).
LABELLED_TWEETS_CSV = '/Users/arjunkhanchandani/Desktop/twitter_data_analysis-main/data/final_manual_priority.csv'

tweets_df = pd.read_csv(LABELLED_TWEETS_CSV)
# Display (rows, columns) as the cell output.
tweets_df.shape

(198, 19)

In [4]:
# Preview the first five rows of the dataset as loaded from the CSV.
tweets_df.head()

Unnamed: 0,S.no.,user_id,tweet_id,username,location,following,followers,twt_created_at,total_tweets,retweet_count,text,hashtags,mentions,textblob_polarity,nltk_compound,avg_sentiment,textblob_sentiment,nltk_sentiment,priority
0,4,11,11,UtkarshMishra_9,"Noida, India",707,1122,2022-11-08 21:14:55+00:00,5764,0,estimated magnitude earthquake affected countr...,"[{'text': 'earthquake', 'indices': [137, 148]}...","[{'screen_name': 'ANI', 'name': 'ANI', 'id': 3...",0.0,-0.1531,-1,0,-1,0
1,8,18,18,GirjeshKPatel,"‚Ä°¬ß‚â§‚Ä°¬ß√±‚Ä°¬ß¬Æ‚Ä°¬ß√§, ‚Ä°¬ß‚â†‚Ä°¬ß√¶...",164,988,2022-11-08 20:54:48+00:00,522,0,heavy roorke uttrakhand second horrible moment,"[{'text': 'earthquake', 'indices': [6, 17]}]","[{'screen_name': 'ZeeNews', 'name': 'Zee News'...",-0.4,-0.5423,-1,-1,-1,1
2,10,25,25,TheAnantpandit,"New Delhi, India",409,19,2022-11-08 20:46:16+00:00,152,0,earthquake magnitude occurred ist lat long dep...,[],"[{'screen_name': 'Indiametdept', 'name': 'Indi...",-0.05,0.0,-1,-1,0,0
3,11,31,31,kanhagupta21,Allahabad,182,13,2022-11-08 20:38:44+00:00,73,0,horrible ended running outside home safe,[],"[{'screen_name': 'ANI', 'name': 'ANI', 'id': 3...",-0.166667,-0.1531,-1,-1,-1,1
4,13,42,42,Kunalgupta_voi,,21,1,2022-11-08 20:10:38+00:00,1,0,choking wakeup antismog gun installed watering...,"[{'text': 'DelhiPollution', 'indices': [253, 2...","[{'screen_name': 'ArvindKejriwal', 'name': 'Ar...",0.0,-0.6597,-1,0,-1,1


In [5]:
# Keep only the tweet identifier, the tweet text and the priority label.
# Every other column (S.no., user metadata, hashtags/mentions and the
# textblob/nltk sentiment scores) is discarded.
# The kept columns are selected into a fresh frame and the name is rebound,
# instead of dropping in place, so re-running this cell does not raise a
# KeyError for columns that are already gone.
tweets_df = tweets_df.loc[:, ['tweet_id', 'text', 'priority']].copy()

In [6]:
# List the columns that remain in the working frame.
tweets_df.columns

Index(['tweet_id', 'text', 'priority'], dtype='object')

In [7]:
# Preview the first five rows of the working frame.
tweets_df.head()

Unnamed: 0,tweet_id,text,priority
0,11,estimated magnitude earthquake affected countr...,0
1,18,heavy roorke uttrakhand second horrible moment,1
2,25,earthquake magnitude occurred ist lat long dep...,0
3,31,horrible ended running outside home safe,1
4,42,choking wakeup antismog gun installed watering...,1


In [8]:
# Count how many tweets carry each priority label (class balance check).
tweets_df['priority'].value_counts()

1    109
0     89
Name: priority, dtype: int64

In [9]:
def creating_tokens(df):
    """Add tokenised and stemmed versions of each tweet's text to ``df``.

    Two columns are assigned on the frame that was passed in:
    ``tokens`` holds the ``TweetTokenizer`` output for every ``text`` value,
    and ``stemmed_tokens`` holds those tokens after ``PorterStemmer.stem``.

    Args:
        df: DataFrame with a ``text`` column of strings.

    Returns:
        The same DataFrame object, now carrying the two extra columns.
    """
    tweet_tokenizer = TweetTokenizer()
    stemmer = PorterStemmer()

    # Note: the columns are written onto the caller's frame, not onto a copy.
    df['tokens'] = [tweet_tokenizer.tokenize(text) for text in df.loc[:, 'text']]
    df['stemmed_tokens'] = [
        [stemmer.stem(token) for token in token_list]
        for token_list in df['tokens']
    ]
    return df

In [10]:
# Add the 'tokens' and 'stemmed_tokens' columns, then preview the first rows.
tweets_df = creating_tokens(tweets_df)
tweets_df.head()

Unnamed: 0,tweet_id,text,priority,tokens,stemmed_tokens
0,11,estimated magnitude earthquake affected countr...,0,"[estimated, magnitude, earthquake, affected, c...","[estim, magnitud, earthquak, affect, countri, ..."
1,18,heavy roorke uttrakhand second horrible moment,1,"[heavy, roorke, uttrakhand, second, horrible, ...","[heavi, roork, uttrakhand, second, horribl, mo..."
2,25,earthquake magnitude occurred ist lat long dep...,0,"[earthquake, magnitude, occurred, ist, lat, lo...","[earthquak, magnitud, occur, ist, lat, long, d..."
3,31,horrible ended running outside home safe,1,"[horrible, ended, running, outside, home, safe]","[horribl, end, run, outsid, home, safe]"
4,42,choking wakeup antismog gun installed watering...,1,"[choking, wakeup, antismog, gun, installed, wa...","[choke, wakeup, antismog, gun, instal, water, ..."


In [11]:
from sklearn.model_selection import train_test_split

def split_data(df, test_size):
    """Split the stemmed tokens and priority labels into train and test parts.

    The split is stratified on ``priority`` and uses a fixed
    ``random_state`` of 42. The label counts of the train and test
    partitions are printed, in that order.

    Args:
        df: DataFrame with ``stemmed_tokens`` and ``priority`` columns.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        ``(x_train, x_test, y_train, y_test)``. Each element is a DataFrame
        with a fresh 0..n-1 index; the row labels from ``df`` are kept in an
        extra column (named ``index`` when ``df``'s index is unnamed).
    """
    labels = df['priority']
    parts = train_test_split(
        df['stemmed_tokens'],
        labels,
        test_size=test_size,
        random_state=42,
        stratify=labels,
    )

    # Report the class balance of the train labels, then the test labels.
    for label_part in parts[2:]:
        print(label_part.value_counts())

    # Series -> one-column frame; reset_index() moves the old labels into a column.
    x_train, x_test, y_train, y_test = (
        part.to_frame().reset_index() for part in parts
    )
    return x_train, x_test, y_train, y_test

In [12]:
# Hold out 30% of the tweets as the test partition.
x_train, x_test, y_train, y_test = split_data(df=tweets_df, test_size=0.3)

1    76
0    62
Name: priority, dtype: int64
1    33
0    27
Name: priority, dtype: int64


In [13]:
# Print the first rows of each partition: train/test features, then train/test labels.
for partition in (x_train, x_test, y_train, y_test):
    print(partition.head())

   index                                     stemmed_tokens
0    164  [forgiv, rapist, murder, cruel, enter, mandir,...
1     36  [socha, puch, illeg, resort, demolish, still, ...
2     92  [todai, whole, year, complet, dai, written, pa...
3     42  [see, ground, realiti, statu, sardar, pathet, ...
4     50  [dirti, monei, aap, need, investig, lie, detec...
   index                                     stemmed_tokens
0    188  [hai, chor, sath, deta, hai, pich, leta, hai, ...
1     23  [travel, pass, markundi, toll, acp, toll, pvt,...
2    174  [final, todai, yet, mcd, remov, broken, bench,...
3     73  [crore, public, monei, invest, project, clean,...
4    152  [histor, judgement, suprem, court, put, stamp,...
   index  priority
0    164         0
1     36         0
2     92         0
3     42         1
4     50         0
   index  priority
0    188         1
1     23         1
2    174         0
3     73         1
4    152         0


In [105]:
from gensim.models import Word2Vec
import time

# Dimensionality of the word vectors; it is also embedded in the model file name.
# NOTE(review): 8000 dimensions is very large for a corpus of 198 tweets — consider revisiting.
VECTOR_SIZE = 8000

OUTPUT_FOLDER = '/Users/arjunkhanchandani/Desktop/twitter_data_analysis-main'

start_time = time.time()
# One sequence of stemmed tokens per tweet; this is the Word2Vec training corpus.
# NOTE(review): the corpus is the full dataset, so it includes the rows that
# split_data later assigns to the test partition.
tokens = tweets_df['stemmed_tokens'].values
# os.path.join inserts the path separator, so the model is saved inside
# OUTPUT_FOLDER. Plain string concatenation glued the file name onto the
# folder name and wrote '...twitter_data_analysis-mainword2vec_8000.model'
# next to the project folder instead.
word2vec_model_file = os.path.join(OUTPUT_FOLDER, 'word2vec_' + str(VECTOR_SIZE) + '.model')

# sg=1 selects skip-gram explicitly; the documented values for `sg` are 0 and 1.
# NOTE(review): with workers > 1 the training is not guaranteed to be
# bit-for-bit repeatable between runs.
w2v_model = Word2Vec(tokens, min_count=1, vector_size=VECTOR_SIZE, window=8, workers=7, sg=1)
print("Time taken to train word2vec model: " + str(time.time() - start_time))
w2v_model.save(word2vec_model_file)


Time taken to train word2vec model: 2.043767213821411


In [106]:
def create_file(create_file, model_file, x):
    """Write one averaged Word2Vec vector per row of ``x`` to a CSV and load it back.

    For every row, the vectors of its ``stemmed_tokens`` are averaged
    element-wise. Tokens that are not in the model's vocabulary are skipped,
    and a row with no usable tokens is written as an all-zero vector. The CSV
    gets a header row ``0,1,...,vector_size-1`` followed by one line per row
    of ``x``, in iteration order.

    Args:
        create_file: Path of the CSV file to (over)write. Note that this
            parameter shadows the function's own name inside the body.
        model_file: Path of a saved Word2Vec model, loaded with ``Word2Vec.load``.
        x: DataFrame with a ``stemmed_tokens`` column holding lists of tokens.

    Returns:
        The written CSV read back with ``pd.read_csv``.
    """
    sg_w2v_model = Word2Vec.load(model_file)
    keyed_vectors = sg_w2v_model.wv
    # Take the vector width from the loaded model rather than assuming 8000.
    vector_size = keyed_vectors.vector_size

    with open(create_file, 'w') as word2vec_file:
        # Write the header once, before any data row. Doing it inside the loop
        # only when a row label equals 0 drops the header for any frame whose
        # index does not contain 0 (and for an empty frame).
        word2vec_file.write(",".join(str(column) for column in range(vector_size)))
        word2vec_file.write("\n")

        for _, row in x.iterrows():
            # Skip out-of-vocabulary tokens instead of failing with a KeyError.
            known_vectors = [
                keyed_vectors[token]
                for token in row['stemmed_tokens']
                if token in keyed_vectors
            ]
            if known_vectors:
                model_vector = np.mean(known_vectors, axis=0).tolist()
            else:
                # Nothing to average: fall back to a zero vector of the same width.
                model_vector = [0.0] * vector_size
            word2vec_file.write(",".join(str(value) for value in model_vector))
            word2vec_file.write("\n")

    return pd.read_csv(create_file)
        

In [107]:
# os.path.join inserts the path separator so the CSV is written inside
# OUTPUT_FOLDER; plain concatenation glued the file name onto the folder name.
word2vec_train_filename = os.path.join(OUTPUT_FOLDER, 'word2vec_train_' + str(8000) + '.csv')
# Average the word vectors of every training tweet and load the result as a frame.
word2vec_train_df = create_file(word2vec_train_filename, word2vec_model_file, x_train)
print(word2vec_train_df.shape)
word2vec_train_df.head()

(138, 8000)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7990,7991,7992,7993,7994,7995,7996,7997,7998,7999
0,8.9e-05,-2.9e-05,8e-06,-3.9e-05,-2e-06,1e-05,-2.6e-05,3.6e-05,1.7e-05,2e-06,...,2.2e-05,-1.9e-05,5e-06,-4.3e-05,-3.1e-05,1.3e-05,-2.3e-05,2.7e-05,4e-06,3.9e-05
1,0.000128,-2.9e-05,2.5e-05,-2.2e-05,4.7e-05,-1.1e-05,-5e-05,9e-06,5.9e-05,-4e-06,...,4.8e-05,-5.9e-05,6.1e-05,5e-06,-2.2e-05,3.5e-05,-2.6e-05,-6e-06,-8e-06,1.8e-05
2,0.000156,-5.2e-05,5.2e-05,-0.0001,7.7e-05,1.9e-05,-4.6e-05,5.7e-05,0.000119,-3e-06,...,6.5e-05,-0.000115,2.2e-05,-2.6e-05,-6.9e-05,-3.5e-05,-3.4e-05,-2e-06,-4.5e-05,7.5e-05
3,0.000126,-6.2e-05,3.7e-05,-0.000113,6.7e-05,1.4e-05,-5.1e-05,2e-05,9e-05,7e-06,...,3.1e-05,-0.000109,3.7e-05,7e-06,-4.4e-05,-2.4e-05,-2.9e-05,2.3e-05,-2.2e-05,4.7e-05
4,0.000162,-7.8e-05,7.6e-05,-0.000148,5.9e-05,1.3e-05,-8.2e-05,6.5e-05,0.000126,3.2e-05,...,7e-05,-9.8e-05,7.3e-05,-4.2e-05,-3e-05,-4.6e-05,-7.6e-05,-2.8e-05,-1.1e-05,6.9e-05


In [108]:
# os.path.join inserts the path separator so the CSV is written inside
# OUTPUT_FOLDER; plain concatenation glued the file name onto the folder name.
word2vec_test_filename = os.path.join(OUTPUT_FOLDER, 'word2vec_test_' + str(8000) + '.csv')
# Average the word vectors of every test tweet and load the result as a frame.
word2vec_test_df = create_file(word2vec_test_filename, word2vec_model_file, x_test)
print(word2vec_test_df.shape)
word2vec_test_df.head()

(60, 8000)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7990,7991,7992,7993,7994,7995,7996,7997,7998,7999
0,0.000154,-4.6e-05,6.9e-05,-0.000102,5.2e-05,3.1e-05,-5e-05,5.9e-05,0.000134,3.8e-05,...,8.7e-05,-0.000155,6.7e-05,-4.4e-05,-4e-05,6e-06,-5.7e-05,-4.1e-05,1.1e-05,0.000108
1,9.3e-05,-4e-05,2.6e-05,-0.000109,5.3e-05,1e-05,-3.2e-05,4.7e-05,6.3e-05,1.5e-05,...,6.4e-05,-9.1e-05,2.8e-05,-3.2e-05,-3e-05,-6e-06,-2.1e-05,-2.5e-05,1.7e-05,9.3e-05
2,0.000133,-2.7e-05,4.5e-05,-4.2e-05,2.6e-05,3.7e-05,-2e-05,3.1e-05,5.7e-05,3.5e-05,...,5.6e-05,-6.6e-05,6.4e-05,-2.8e-05,-1.2e-05,-1.5e-05,-3.5e-05,-4.2e-05,8e-06,7.9e-05
3,0.000147,-6.9e-05,5.8e-05,-6.8e-05,0.000103,1.7e-05,-3e-05,6.3e-05,9.5e-05,6e-06,...,6.6e-05,-0.000133,7.5e-05,-3e-06,-5.2e-05,-2.3e-05,-2.3e-05,-3.3e-05,-2.9e-05,3.5e-05
4,8.4e-05,-4.4e-05,8e-06,-5.4e-05,6e-06,2.2e-05,-5.5e-05,1.4e-05,0.00012,2e-06,...,4.3e-05,-0.000103,4.2e-05,-3.8e-05,-5.4e-05,3e-06,-3.8e-05,7e-06,-3.8e-05,7.8e-05


In [109]:
# Show the shapes of the train and test label frames, in that order.
for label_frame in (y_train, y_test):
    print(label_frame.shape)

(138, 2)
(60, 2)


# Decision Tree

In [110]:
# Initialize the model
# Initialize the model. random_state is fixed so that fitting is deterministic:
# without it the tree (and the metrics reported from it) can differ between
# runs on the same training data.
clf_decision_word2vec = DecisionTreeClassifier(random_state=42)

start_time = time.time()
# Fit the model on the averaged word2vec features against the priority labels.
clf_decision_word2vec.fit(word2vec_train_df, y_train['priority'])
print("Time taken to fit the model with word2vec vectors: " + str(time.time() - start_time))

Time taken to fit the model with word2vec vectors: 0.2762019634246826


In [111]:
from sklearn.metrics import classification_report
        
# Predict priorities for the test vectors, then print the per-class metrics.
y_pred_word2vec = clf_decision_word2vec.predict(word2vec_test_df)
word2vec_report = classification_report(y_test['priority'], y_pred_word2vec)
print(word2vec_report)

              precision    recall  f1-score   support

           0       0.59      0.63      0.61        27
           1       0.68      0.64      0.66        33

    accuracy                           0.63        60
   macro avg       0.63      0.63      0.63        60
weighted avg       0.64      0.63      0.63        60



Reference: "A simple Word2Vec tutorial" — https://medium.com/@zafaralibagh6/a-simple-word2vec-tutorial-61e64e38a6a1