In [None]:
!pip install WordCloud

In [None]:
#import library

import numpy as np
import pandas as pd

import re
import nltk

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

In [None]:
# Read the training and test tweets from CSV files in the working directory.
# NOTE(review): 'Twwet_test.csv' looks like a misspelling — confirm it matches the file on disk.
train, test = (pd.read_csv(csv_path) for csv_path in ('tweet_set_train.csv', 'Twwet_test.csv'))

# (rows, columns) of each frame, one per line
print(train.shape, test.shape, sep = '\n')

In [None]:
# first five rows of the training frame
train.head(5)

In [None]:
# first five rows of the test frame
test.head(5)

In [None]:
# Per-column "has any missing value" flags for both frames, side by side.
# A cell only displays its last expression, so two bare expressions would
# silently drop the result for `train`.
# A column that exists in only one frame shows NaN for the other.
pd.DataFrame({'train': train.isnull().any(), 'test': test.isnull().any()})

In [None]:
# first 10 training tweets with label 0
# NOTE(review): the word-cloud and hashtag cells treat label 0 as the positive class — confirm the label encoding

train.loc[train['label'] == 0].head(10)

In [None]:
# first 10 training tweets with label 1
# NOTE(review): the word-cloud and hashtag cells treat label 1 as the negative class — confirm the label encoding

train.loc[train['label'] == 1].head(10)

In [None]:
# number of training tweets per label, as a bar chart
label_counts = train['label'].value_counts()
label_counts.plot(kind = 'bar', color = 'green', figsize = (8, 6))

In [None]:
# histograms of tweet length in characters: train in brown, test in blue

train_lengths = train['tweet'].str.len()
test_lengths = test['tweet'].str.len()

length_train = train_lengths.plot(kind = 'hist', color = 'brown', figsize = (8, 6))
length_test = test_lengths.plot(kind = 'hist', color = 'blue', figsize = (8, 6))

In [None]:

# store every tweet's character count in a new 'len' column of each frame
for frame in (train, test):
    frame['len'] = frame['tweet'].str.len()

train.head(10)

In [None]:
# summary statistics of the numeric columns, computed separately per label
train.groupby(by = 'label').describe()

In [None]:
# Mean label for every distinct tweet length, drawn as a histogram of those means.
# 'label' is selected before aggregating so .mean() never touches the text
# column; pandas >= 2.0 raises TypeError when asked to average strings.
# NOTE(review): the x axis of this histogram holds mean-label values, not lengths —
# confirm whether the 'Length' axis label is what was intended.
train.groupby('len')['label'].mean().plot.hist(color = 'pink', figsize = (6, 4),)
plt.title('variation accrding to twitter length')
plt.xlabel('Length')
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


# term counts over the raw training tweets, English stop words excluded
cov = CountVectorizer(stop_words = 'english')
tweet_words = cov.fit_transform(train.tweet)

# summing over rows gives one total per vocabulary term
tweet_sum_words = tweet_words.sum(axis=0)

# (term, total) pairs, most frequent first
tweet_words_freq = [(term, tweet_sum_words[0, column]) for term, column in cov.vocabulary_.items()]
tweet_words_freq.sort(key = lambda pair: pair[1], reverse = True)

frequency = pd.DataFrame(tweet_words_freq, columns=['tweet_word', 'freq'])

# bar chart of the 50 most frequent terms
top_terms = frequency.head(50)
top_terms.plot(x='tweet_word', y='freq', kind='bar', figsize=(15, 7), color = 'orange')
plt.title("Most Frequently Occuring Words in the twitter ")

In [None]:
from wordcloud import WordCloud

# word cloud sized by the (term, total) pairs in tweet_words_freq
term_totals = dict(tweet_words_freq)
cloud_builder = WordCloud(background_color = 'white', width = 1200, height = 1200)
twitter_wordcloud = cloud_builder.generate_from_frequencies(term_totals)

plt.figure(figsize=(10,8))
plt.imshow(twitter_wordcloud)
plt.title("WordCloud - Vocabulary from Reviews", fontsize = 20)

In [None]:
# every label-0 tweet joined into one space-separated string
positive_words = ' '.join(train.loc[train['label'] == 0, 'tweet'])

cloud_settings = dict(width=1000, height=800, random_state = 0, max_font_size = 110)
twitterwordcloud = WordCloud(**cloud_settings).generate(positive_words)

plt.figure(figsize=(10, 7))
plt.imshow(twitterwordcloud, interpolation="bilinear")
plt.title('Twitter Positive Words')
plt.axis('off')
plt.show()

In [None]:
# every label-1 tweet joined into one space-separated string
negative_words = ' '.join(train.loc[train['label'] == 1, 'tweet'])

cloud_settings = dict(background_color = 'cyan', width=800, height=500, random_state = 0, max_font_size = 110)
twitterwordcloud = WordCloud(**cloud_settings).generate(negative_words)

plt.figure(figsize=(10, 7))
plt.imshow(twitterwordcloud, interpolation="bilinear")
plt.title('Twitter Negative Words')
plt.axis('off')
plt.show()

In [None]:
# collect the hashtags

def twitter_hashtag_extract(x):
    """Return the hashtags found in each string of ``x``.

    A hashtag is a ``#`` followed by one or more word characters; the
    leading ``#`` is not part of the returned tag.

    Parameters
    ----------
    x : iterable of str
        Texts to scan.

    Returns
    -------
    list of list of str
        One inner list per input text, in input order; the inner list is
        empty when the text contains no hashtag.
    """
    return [re.findall(r"#(\w+)", text) for text in x]

In [None]:
# hashtags from the label-0 tweets (the class this notebook calls "positive")
TH_positive = twitter_hashtag_extract(train['tweet'][train['label'] == 0])

# hashtags from the label-1 tweets (the class this notebook calls "negative")
TH_negative = twitter_hashtag_extract(train['tweet'][train['label'] == 1])


# Flatten the per-tweet lists into one flat list per class.  The comprehension
# is linear in the number of hashtags, whereas sum(list_of_lists, []) copies
# the growing accumulator on every addition.
TH_positive = [tag for tweet_tags in TH_positive for tag in tweet_tags]
TH_negative = [tag for tweet_tags in TH_negative for tag in tweet_tags]

In [None]:
# how often each hashtag occurs in TH_positive
twitter_FD = nltk.FreqDist(TH_positive)
hashtags, counts = list(twitter_FD.keys()), list(twitter_FD.values())
twiteer_Data = pd.DataFrame({'twitter_Hashtag': hashtags, 'twitter_Count': counts})

# keep the 25 most frequent hashtags and chart them
twiteer_Data = twiteer_Data.nlargest(25, "twitter_Count")
plt.figure(figsize=(16,5))
ax = sns.barplot(x = "twitter_Hashtag", y = "twitter_Count", data = twiteer_Data)
ax.set_ylabel('Number of Hashtags')
plt.show()

In [None]:
# how often each hashtag occurs in TH_negative
twitter_FD = nltk.FreqDist(TH_negative)
hashtags, counts = list(twitter_FD.keys()), list(twitter_FD.values())
twiteer_Data = pd.DataFrame({'twitter_Hashtag': hashtags, 'twitter_Count': counts})

# keep the 25 most frequent hashtags and chart them
twiteer_Data = twiteer_Data.nlargest(25, "twitter_Count")
plt.figure(figsize=(16,5))
ax = sns.barplot(x = "twitter_Hashtag", y = "twitter_Count", data = twiteer_Data)
ax.set_ylabel('Number of Hashtags')
plt.show()

In [None]:
# importing gensim
import gensim

# split every training tweet on whitespace
tokenized_tweet_set = train['tweet'].map(lambda tweet: tweet.split())

# Word2Vec settings, gathered in one place.
# NOTE(review): `size` is the gensim 3.x argument name; gensim >= 4.0 calls it
# `vector_size` — confirm which gensim version this notebook runs on.
w2v_settings = dict(
            size=200,
            window=5,
            min_count=2,
            sg = 1,
            hs = 0,
            negative = 10,
            workers= 2,
            seed = 34)
twitter_model = gensim.models.Word2Vec(tokenized_tweet_set, **w2v_settings)

# NOTE(review): the corpus was already handed to the constructor above; this
# call runs a further 20 epochs on top of it — confirm that is intended.
twitter_model.train(tokenized_tweet_set, total_examples= len(train['tweet']), epochs=20)

In [None]:
# words whose vectors lie closest to "can"
twitter_model.wv.most_similar(positive = ["can"])

In [None]:
# Words that contribute negatively with respect to "hate".
# The word is passed inside a list: older gensim releases iterate a bare
# string given as `negative` character by character ('h', 'a', 't', 'e').
twitter_model.wv.most_similar(negative = ["hate"])

In [None]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
# TaggedDocument is the supported class; LabeledSentence was only a deprecated
# alias for it and can no longer be imported from gensim 4.0 onwards.
from gensim.models.doc2vec import TaggedDocument

# keep the old name resolvable for any code that still refers to it
LabeledSentence = TaggedDocument

def add_label_twitter(twitter):
    """Wrap every tokenised tweet in a TaggedDocument tagged ``tweet_<index>``.

    Parameters
    ----------
    twitter : pandas.Series
        Token lists; the Series index label becomes part of each tag.

    Returns
    -------
    list
        One TaggedDocument per entry of ``twitter``, in the same order, each
        carrying the single tag ``"tweet_" + str(index_label)``.
    """
    return [TaggedDocument(s, ["tweet_" + str(i)]) for i, s in zip(twitter.index, twitter)]

# label all the tweets
labeled_tweets = add_label_twitter(tokenized_tweet_set)

labeled_tweets[:6]

In [None]:


nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the stemmer and the stop-word set once, up front.  Rebuilding the set
# from stopwords.words('english') for every single word dominates the running
# time of this cell.
twitter_ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

Twtrain_corpus = []

# Walk over every tweet actually present rather than a hard-coded row count,
# so the cell keeps working if the training file changes size.
for tweet in train['tweet']:
  # replace everything but ASCII letters with spaces, lower-case, split on whitespace
  Treview = re.sub('[^a-zA-Z]', ' ', tweet).lower().split()

  # drop stop words, stem what remains
  Treview = [twitter_ps.stem(word) for word in Treview if word not in english_stopwords]

  # joining them  with space
  Treview = ' '.join(Treview)
  Twtrain_corpus.append(Treview)

In [None]:
# Clean the test tweets with the same steps used for the training corpus:
# letters only, lower-case, stop words removed, Porter-stemmed.
# `stopwords` and `PorterStemmer` come from the training-corpus cell.
twitter_ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

Twtest_corpus = []

# Walk over every test tweet actually present rather than a hard-coded row count.
for tweet in test['tweet']:
  # replace everything but ASCII letters with spaces, lower-case, split on whitespace
  Treview = re.sub('[^a-zA-Z]', ' ', tweet).lower().split()

  # Stemming / stop-word removal.  The result must be stored back in Treview:
  # assigning it to a different name leaves the joined text un-stemmed and
  # still full of stop words, unlike the training corpus.
  Treview = [twitter_ps.stem(word) for word in Treview if word not in english_stopwords]

  # joining them with space
  Treview = ' '.join(Treview)
  Twtest_corpus.append(Treview)

In [None]:

from sklearn.feature_extraction.text import CountVectorizer

# term-count features for the cleaned training corpus, capped at 707 terms
cv_train = CountVectorizer(max_features = 707)
train_counts = cv_train.fit_transform(Twtrain_corpus)
x_twt = train_counts.toarray()

# NOTE(review): column position 1 is presumably the label column — confirm against the CSV layout
y_twt = train.iloc[:, 1]

print(x_twt.shape, y_twt.shape, sep = '\n')

In [None]:

# Encode the test corpus with the vectorizer that was fitted on the training
# corpus (cv_train), so each column stands for the same term — and there are
# exactly as many columns — as in x_twt.  Fitting a second vectorizer on the
# test tweets would learn an unrelated vocabulary and column order.
cv_test = cv_train  # alias kept so references to cv_test still resolve
x_test = cv_test.transform(Twtest_corpus).toarray()

print(x_test.shape)

In [None]:
# hold out 25% of the training features/labels for validation (fixed seed)

from sklearn.model_selection import train_test_split

split_parts = train_test_split(x_twt, y_twt, test_size = 0.25, random_state = 42)
x_tra, x_val, y_tra, y_val = split_parts

# shape of each of the four pieces, one per line
for part in (x_tra, x_val, y_tra, y_val):
    print(part.shape)

In [None]:
# standardize the features: statistics are learned from the training split only

from sklearn.preprocessing import StandardScaler

twitter_sc = StandardScaler()
twitter_sc.fit(x_tra)

# apply the training-split statistics to both splits
x_tra = twitter_sc.transform(x_tra)
x_val = twitter_sc.transform(x_val)



In [None]:
# Standardize the test features with the scaler already fitted on the training
# split (twitter_sc), applied exactly once.  Creating a fresh scaler here,
# fitting it on the test data and then transforming the already-scaled result
# again would put x_tst on a different scale from what the model was trained on.
# x_test itself is left unscaled so this cell can be re-run safely.
# Requires x_test to have the same feature columns as the training matrix.
x_tst = twitter_sc.transform(x_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# A fixed random_state makes the forest — and every metric printed below —
# reproducible across kernel restarts.
# NOTE(review): this rebinds twitter_model, which the Word2Vec cell also uses;
# the most_similar cells will fail if re-run after this one.
twitter_model = RandomForestClassifier(random_state = 42)
twitter_model.fit(x_tra, y_tra)

# predictions for the held-out validation split
y_prd = twitter_model.predict(x_val)

print("Training Accuracy :", twitter_model.score(x_tra, y_tra))
print("Validation Accuracy :", twitter_model.score(x_val, y_val))

# calculating the f1 score for the validation set
print("F1 score :", f1_score(y_val, y_prd))

# confusion matrix
cm = confusion_matrix(y_val, y_prd)
print(cm)


In [None]:
# predicted labels for the scaled test features
y_prd_new = twitter_model.predict(X = x_tst)

In [None]:
# display the predictions produced by twitter_model.predict
y_prd_new