In [1]:
# Importing all the required packages for this task

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
# from sklearn import metrics

In [7]:
# Load the tweet export into a DataFrame; the path is relative to the notebook's working directory.
tweets_df = pd.read_csv("clean_processed_tweet_data.csv")

In [8]:
# Preview the first rows to sanity-check the loaded columns.
tweets_df.head()

Unnamed: 0.1,Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,38,2022-04-22 22:17:05+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @NorthstarCharts: The 10-year yield is tell...,0.16,0.54,en,188,43,davideiacovozzi,18,55,,"gold, silver, crypto",NorthstarCharts,
1,39,2022-04-22 13:44:53+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @MichaelAArouet: German 10y mortgage rate w...,0.15,0.175,en,179,32,davideiacovozzi,18,55,,,MichaelAArouet,
2,41,2022-04-22 06:10:34+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @goldseek: When? https://t.co/kO2FfHKaZg,0.0,0.0,en,193,26,davideiacovozzi,18,55,False,,goldseek,
3,42,2022-04-21 17:22:09+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @charliebilello: The 30-year mortgage rate ...,0.0,0.183333,en,620,213,davideiacovozzi,18,55,,,charliebilello,
4,43,2022-04-21 10:32:26+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @biancoresearch: Rates rise until something...,-0.4,0.4,en,1787,417,davideiacovozzi,18,55,False,,biancoresearch,


### Data Understanding

In [9]:
# Print column dtypes and non-null counts to see which fields have missing data.
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16472 entries, 0 to 16471
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          16472 non-null  int64  
 1   created_at          16472 non-null  object 
 2   source              16472 non-null  object 
 3   original_text       16472 non-null  object 
 4   polarity            16472 non-null  float64
 5   subjectivity        16472 non-null  float64
 6   lang                16472 non-null  object 
 7   favorite_count      16472 non-null  int64  
 8   retweet_count       16472 non-null  int64  
 9   original_author     16472 non-null  object 
 10  followers_count     16472 non-null  int64  
 11  friends_count       16472 non-null  int64  
 12  possibly_sensitive  6206 non-null   object 
 13  hashtags            5278 non-null   object 
 14  user_mentions       12165 non-null  object 
 15  place               10765 non-null  object 
dtypes: f

In [11]:
# Summarise missing data: the total number of null cells in the frame,
# followed by the labels of the columns that contain at least one null.
null_mask = tweets_df.isnull()

print("The number of missing value(s): {}".format(null_mask.sum().sum()))
print("Columons having columns value:{}".format(tweets_df.columns[null_mask.any()]))

The number of missing value(s): 31474
Columons having columns value:Index(['possibly_sensitive', 'hashtags', 'user_mentions', 'place'], dtype='object')


In [13]:
# Re-assign the text column from a plain Python list of its own values.
# NOTE(review): this looks like a no-op round trip (same values, same order) — confirm whether
# a conversion (e.g. a cast to str) was intended here.
tweets_df['original_text'] =  tweets_df['original_text'].to_list()

In [16]:
# Drop rows that are missing either field the classifier relies on (tweet text or polarity).
# - dropna() returns a new frame, so the result has to be assigned back to take effect.
# - `subset` keeps rows that only lack optional metadata (possibly_sensitive, hashtags,
#   user_mentions, place); a bare dropna() would discard every row with any null column.
# - reset_index(drop=True) keeps a contiguous 0..n-1 index, which the positionally built
#   Series in later cells (score, scoremap) are aligned against.
tweets_df = (
    tweets_df
    .dropna(subset=['original_text', 'polarity'])
    .reset_index(drop=True)
)
tweets_df

Unnamed: 0.1,Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,38,2022-04-22 22:17:05+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @NorthstarCharts: The 10-year yield is tell...,0.16,0.540000,en,188,43,davideiacovozzi,18,55,,"gold, silver, crypto",NorthstarCharts,
1,39,2022-04-22 13:44:53+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @MichaelAArouet: German 10y mortgage rate w...,0.15,0.175000,en,179,32,davideiacovozzi,18,55,,,MichaelAArouet,
2,41,2022-04-22 06:10:34+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @goldseek: When? https://t.co/kO2FfHKaZg,0.00,0.000000,en,193,26,davideiacovozzi,18,55,False,,goldseek,
3,42,2022-04-21 17:22:09+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @charliebilello: The 30-year mortgage rate ...,0.00,0.183333,en,620,213,davideiacovozzi,18,55,,,charliebilello,
4,43,2022-04-21 10:32:26+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @biancoresearch: Rates rise until something...,-0.40,0.400000,en,1787,417,davideiacovozzi,18,55,False,,biancoresearch,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16467,24584,2022-04-22 15:22:56+00:00,"<a href=""http://twitter.com/download/iphone"" r...",RT @CHARANJITCHANNI: Best wishes &amp; heartfe...,0.50,0.729630,en,2924,300,kitukalesatya,706,643,,,"CHARANJITCHANNI, RajaBrar_INC, BB__Ashu",
16468,24586,2022-04-22 15:22:29+00:00,"<a href=""http://twitter.com/download/iphone"" r...",RT @pbhushan1: Thank you @BajpayeeManoj for th...,0.85,1.000000,en,14671,5006,kitukalesatya,706,643,,,"pbhushan1, BajpayeeManoj",
16469,24596,2022-04-22 15:01:27+00:00,"<a href=""http://twitter.com/download/iphone"" r...",RT @s_shreyatweets: Agree ?👇 https://t.co/R54Z...,0.00,0.000000,en,5056,973,kitukalesatya,706,643,False,,s_shreyatweets,
16470,24599,2022-04-22 14:58:12+00:00,"<a href=""http://twitter.com/download/iphone"" r...",RT @tejjINC: 1. Peace Yatra by Late Sunil Dutt...,-0.30,0.600000,en,636,115,kitukalesatya,706,643,False,,tejjINC,


In [17]:
# Keep only the two columns the model needs: the raw tweet text and its polarity score.
clean_tweet = tweets_df.loc[:, ['original_text', 'polarity']]

In [18]:
# Preview the reduced text/polarity frame.
clean_tweet.head()

Unnamed: 0,original_text,polarity
0,RT @NorthstarCharts: The 10-year yield is tell...,0.16
1,RT @MichaelAArouet: German 10y mortgage rate w...,0.15
2,RT @goldseek: When? https://t.co/kO2FfHKaZg,0.0
3,RT @charliebilello: The 30-year mortgage rate ...,0.0
4,RT @biancoresearch: Rates rise until something...,-0.4


In [19]:
def text_category(polarity):
    """Map a numeric polarity score to a sentiment label.

    Parameters
    ----------
    polarity : number
        Polarity score of a tweet.

    Returns
    -------
    str
        'positive' when the score is above zero, 'negative' when it is below
        zero, and 'neutral' otherwise (zero, or a value such as NaN for which
        both comparisons are false).
    """
    # The "> 0" test is evaluated before the "< 0" test.
    return 'positive' if polarity > 0 else ('negative' if polarity < 0 else 'neutral')

#### Score

In [20]:
# Label every tweet from its polarity; the result is a fresh Series with a default 0..n-1 index.
score = pd.Series(list(map(text_category, clean_tweet['polarity'])))

In [21]:
# Append the sentiment labels as a new "score" column (joined on the row index).
# Caution: re-running this cell without re-running the cell that builds `clean_tweet`
# appends another "score" column, since the frame is concatenated onto itself.
clean_tweet = pd.concat(objs=[clean_tweet, score.rename("score")], axis="columns")

In [22]:
# Show the first ten rows to confirm each label matches the sign of its polarity.
clean_tweet.head(10)

Unnamed: 0,original_text,polarity,score
0,RT @NorthstarCharts: The 10-year yield is tell...,0.16,positive
1,RT @MichaelAArouet: German 10y mortgage rate w...,0.15,positive
2,RT @goldseek: When? https://t.co/kO2FfHKaZg,0.0,neutral
3,RT @charliebilello: The 30-year mortgage rate ...,0.0,neutral
4,RT @biancoresearch: Rates rise until something...,-0.4,negative
5,RT @LanceRoberts: Buying opportunities like th...,-0.1,negative
6,RT @MacroAlf: Welcome to September 2018.\n\nBo...,0.4,positive
7,RT @BotBenFranklin: The horse thinks one thing...,0.0,neutral
8,RT @Galactic_Trader: Global growth optimism at...,0.488281,positive
9,RT @AndreasSteno: This is the most important c...,0.45,positive


In [23]:
# Number of rows in the labelled frame.
clean_tweet['polarity'].size

16472

In [24]:
# Binary target: 1 for 'positive' tweets, 0 for everything else (negative and neutral together).
scoremap = pd.Series([int(label == 'positive') for label in clean_tweet['score']])

In [346]:
# Number of target values; should equal the row count of clean_tweet.
scoremap.size

16472

In [347]:
# Attach the binary target to the frame, then pull out the model input (raw text)
# and the target as separate Series.
clean_tweet['scoremap'] = scoremap
X, y = clean_tweet['original_text'], clean_tweet['scoremap']

In [348]:
# Hold out 20% of the tweets for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [349]:
# Report split sizes: first train vs. test inputs, then test inputs vs. test labels
# (the second pair should match).
for left, right in ((X_train, X_test), (X_test, y_test)):
    print(left.size, ',', right.size)

13177 , 3295
3295 , 3295


In [350]:
# Linear support-vector classifier with default settings. No scaling is applied here;
# this one estimator object is re-fitted for each TF-IDF variant in the sections below.
# NOTE(review): no random_state is passed, so results may vary slightly between runs — confirm.
clf = LinearSVC()

In [351]:
# Three TF-IDF vectorizers, each capped at 10000 features and differing only in n-gram range:
# single words, word pairs, and both together.
unigram_tfidf_vect = TfidfVectorizer(max_features=10000, ngram_range=(1, 1))
bigram_tfidf_vect = TfidfVectorizer(max_features=10000, ngram_range=(2, 2))
hybrid_tfidf_vect = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

## Unigram TF-IDF

In [352]:
# Fit the unigram TF-IDF vocabulary on the training tweets only, then train the classifier.
# Missing texts are replaced by empty strings so the vectorizer only receives str values.
X_train = X_train.replace(np.nan, '', regex=True)
# Keep the TF-IDF matrix sparse: LinearSVC accepts sparse input directly, and densifying a
# (n_train x up-to-10000) matrix with .toarray() can need on the order of 1 GB of memory.
# (The "counts" name is historical; the values are TF-IDF weights.)
X_train_counts = unigram_tfidf_vect.fit_transform(X_train)
clf.fit(X_train_counts, y_train)

LinearSVC()

In [353]:
# Vectorize the held-out tweets with the vocabulary learned on the training set and predict.
# `clf` is shared across sections, so this cell must run right after the unigram fit cell.
X_test = X_test.replace(np.nan, '', regex=True)
# transform() returns a sparse matrix; LinearSVC.predict accepts it as-is, so the
# memory-hungry .toarray() conversion is not needed.
X_test_counts = unigram_tfidf_vect.transform(X_test)
prediction = clf.predict(X_test_counts)

In [354]:
# Accuracy on the held-out set: fraction of predictions equal to the true label.
(prediction == y_test).mean()

0.8965098634294385

## Bigram TF-IDF

In [355]:
# Fit the bigram TF-IDF vocabulary on the training tweets only, then re-train the classifier.
# Missing texts are replaced by empty strings so the vectorizer only receives str values.
X_train = X_train.replace(np.nan, '', regex=True)
# Keep the TF-IDF matrix sparse: LinearSVC accepts sparse input directly, and densifying a
# (n_train x up-to-10000) matrix with .toarray() can need on the order of 1 GB of memory.
X_train_counts = bigram_tfidf_vect.fit_transform(X_train)
clf.fit(X_train_counts, y_train)

LinearSVC()

In [356]:
# Vectorize the held-out tweets with the bigram vocabulary and predict.
# `clf` is shared across sections, so this cell must run right after the bigram fit cell.
X_test = X_test.replace(np.nan, '', regex=True)
# transform() returns a sparse matrix; LinearSVC.predict accepts it as-is, so the
# memory-hungry .toarray() conversion is not needed.
X_test_counts = bigram_tfidf_vect.transform(X_test)
prediction = clf.predict(X_test_counts)

In [357]:
# Accuracy on the held-out set: fraction of predictions equal to the true label.
(prediction == y_test).mean()

0.7414264036418816

## Hybrid (unigram + bigram) TF-IDF

In [358]:
# Fit the combined unigram+bigram TF-IDF vocabulary on the training tweets, then re-train.
# The NaN cleanup is repeated here (it is idempotent) so this cell does not depend on an
# earlier section having already cleaned X_train.
X_train = X_train.replace(np.nan, '', regex=True)
# Keep the TF-IDF matrix sparse: LinearSVC accepts sparse input directly, and densifying a
# (n_train x up-to-10000) matrix with .toarray() can need on the order of 1 GB of memory.
X_train_counts = hybrid_tfidf_vect.fit_transform(X_train)
clf.fit(X_train_counts, y_train)

LinearSVC()

In [359]:
# Vectorize the held-out tweets with the combined unigram+bigram vocabulary and predict.
# `clf` is shared across sections, so this cell must run right after the hybrid fit cell.
X_test = X_test.replace(np.nan, '', regex=True)
# transform() returns a sparse matrix; LinearSVC.predict accepts it as-is, so the
# memory-hungry .toarray() conversion is not needed.
X_test_counts = hybrid_tfidf_vect.transform(X_test)
prediction = clf.predict(X_test_counts)

In [360]:
# Accuracy on the held-out set: fraction of predictions equal to the true label.
(prediction == y_test).mean()

0.8977238239757208