In [1]:
import numpy as np
import pandas as pd

In [2]:
import re
import gensim
from gensim.models.word2vec import Word2Vec

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [5]:
from keras.models import Sequential
from keras.layers import Dense

In [6]:
# Load the labelled tweets into a DataFrame.
# NOTE(review): absolute path (looks like a Colab working directory) — adjust when running elsewhere.
data = pd.read_csv('/content/tweets.csv')

In [7]:
# Show full tweet text in DataFrame previews instead of truncating long cells.
pd.options.display.max_colwidth = None

In [8]:
# Preview the first five rows to check how the CSV was parsed.
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [9]:
# (rows, columns) of the raw dataset.
data.shape

(7920, 3)

In [10]:
# Drop the row identifier, which is never used as a feature.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell
# idempotent: re-running it once 'id' is gone no longer raises KeyError.
data = data.drop(columns='id', errors='ignore')

In [11]:
# Count missing values per column before any text processing.
data.isna().sum()

label    0
tweet    0
dtype: int64

In [12]:
# Class balance of the target: in the recorded output label 0 is roughly
# three times as frequent as label 1, so plain accuracy should be read
# against a majority-class baseline.
data['label'].value_counts()

0    5894
1    2026
Name: label, dtype: int64

In [13]:
#url removing
def remove_URL(sample):
  """Return `sample` with every URL-like token removed.

  Everything from the literal "http" up to the next whitespace character is
  deleted; the surrounding whitespace itself is left untouched.
  """
  url_pattern = re.compile(r"http\S+")
  return url_pattern.sub("", sample)

In [14]:
# Strip URLs from every tweet; the URL-free text is stored in its own column.
data['text_clean_url-less'] = data['tweet'].apply(remove_URL)

In [15]:
# Tokenise the URL-free text with gensim's simple_preprocess, giving one
# list of tokens per tweet.
data['clean_tweet'] = data['text_clean_url-less'].apply(gensim.utils.simple_preprocess)

In [16]:
# Inspect the intermediate columns side by side: raw tweet, URL-free text and token list.
data.head()

Unnamed: 0,label,tweet,text_clean_url-less,clean_tweet
0,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,#fingerprint #Pregnancy Test #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,"[fingerprint, pregnancy, test, android, apps, beautiful, cute, health, igers, iphoneonly, iphonesia, iphone]"
1,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias…,"[finally, transparant, silicon, case, thanks, to, my, uncle, yay, sony, xperia, sonyexperias]"
2,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect...,"[we, love, this, would, you, go, talk, makememories, unplug, relax, iphone, smartphone, wifi, connect]"
3,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home,"[wired, know, george, was, made, that, way, iphone, cute, daventry, home]"
4,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,"[what, amazing, service, apple, won, even, talk, to, me, about, question, have, unless, pay, them, for, their, stupid, support]"


In [17]:
# Keep only the label and the token list for modelling.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell
# idempotent: re-running it after the columns are gone no longer raises KeyError.
data = data.drop(columns=['tweet', 'text_clean_url-less'], errors='ignore')

In [18]:
# (rows, columns) after dropping the text columns.
data.shape

(7920, 2)

In [19]:
# Hold out 20% of the tweets for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data['clean_tweet'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

In [20]:
# Train Word2Vec embeddings on the training tweets only, so the test split
# never influences the vectors. min_count=1 keeps every token, even those
# that occur once.
# seed + workers=1 make training repeatable (multi-threaded training is
# non-deterministic); gensim additionally needs a fixed PYTHONHASHSEED for
# bit-for-bit reproducibility across interpreter launches.
model_gs = Word2Vec(x_train, min_count=1, seed=42, workers=1)

In [21]:
# Sanity-check the embeddings by listing the nearest neighbours of 'apple'.
# NOTE(review): the recorded similarities are all ~0.998, including unrelated
# words, which suggests the vectors barely separate tokens — confirm the
# embeddings are trained well enough before relying on them.
model_gs.wv.most_similar('apple')

[('here', 0.9991608262062073),
 ('work', 0.9987317323684692),
 ('th', 0.9984205961227417),
 ('mac', 0.9983246326446533),
 ('fuckyou', 0.9982291460037231),
 ('at', 0.9980617761611938),
 ('am', 0.9980165362358093),
 ('ios', 0.9979788661003113),
 ('yet', 0.9979473948478699),
 ('app', 0.9978953003883362)]

In [22]:
# Tokens known to the Word2Vec model; used later to filter out-of-vocabulary tokens.
words = model_gs.wv.index_to_key

In [23]:
# Vocabulary size.
len(words)

14946

In [24]:
# Look up the embedding of every in-vocabulary token of each tweet.
# Membership is tested against the KeyedVectors key -> index dict (O(1))
# instead of scanning the `words` list for every single token.
# The outer arrays use dtype=object because tweets differ in token count.
x_train_vec = np.array(
    [np.array([model_gs.wv[tok] for tok in tokens if tok in model_gs.wv.key_to_index])
     for tokens in x_train],
    dtype=object,
)
x_test_vec = np.array(
    [np.array([model_gs.wv[tok] for tok in tokens if tok in model_gs.wv.key_to_index])
     for tokens in x_test],
    dtype=object,
)

In [25]:
# Length of a single word vector (first token of the first training tweet).
len(x_train_vec[0][0])

100

In [26]:
# Number of dimensions of the outer object array built above.
x_train_vec.ndim

1

In [27]:
def _mean_vectors(token_vectors, dim):
  """Collapse each tweet's token vectors into one fixed-length vector.

  Args:
    token_vectors: Iterable of arrays, one per tweet, holding that tweet's
      word vectors (empty when no token was in the vocabulary).
    dim: Length of the zero vector used for tweets without any known token.

  Returns:
    A list with one 1-D vector per tweet: the element-wise mean of its token
    vectors, or zeros when the tweet has none.
  """
  return [
      vecs.mean(axis=0) if vecs.size else np.zeros(dim, dtype=float)
      for vecs in token_vectors
  ]


# Use the model's own vector size instead of a hard-coded 100 so the zero
# fallback stays correct if the Word2Vec dimensionality is ever changed.
x_train_vec_avg = _mean_vectors(x_train_vec, model_gs.vector_size)
x_test_vec_avg = _mean_vectors(x_test_vec, model_gs.vector_size)

In [28]:
# Random forest on the averaged tweet vectors.
# random_state pins the bootstrap sampling and feature sub-sampling so the
# reported accuracies can be reproduced, like the seeded split and XGBoost.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(x_train_vec_avg, y_train)

In [29]:
#training accuracy
# Scored on the data the forest was fitted on, so this is an optimistic figure.
y_pred = clf.predict(x_train_vec_avg)
accuracy_score(y_true=y_train, y_pred=y_pred)

0.9995265151515151

In [30]:
# Random-forest accuracy on the held-out test split.
y_pred = clf.predict(x_test_vec_avg)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8440656565656566

In [31]:
# Support-vector classifier with an RBF kernel; every other hyper-parameter keeps its default.
svc = SVC(kernel='rbf')

In [32]:
# Fit the SVM on the averaged tweet vectors.
svc.fit(x_train_vec_avg,y_train)

In [33]:
# SVM accuracy on the training split.
y_pred = svc.predict(x_train_vec_avg)
accuracy_score(y_true=y_train, y_pred=y_pred)

0.8416982323232324

In [34]:
# SVM accuracy on the held-out test split.
y_pred = svc.predict(x_test_vec_avg)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8428030303030303

In [35]:
# Gradient-boosted decision stumps (max_depth=1): 20 boosting rounds with a
# fixed seed.
xgb_clf = xgb.XGBClassifier(
    n_estimators=20,
    max_depth=1,
    random_state=42,
    learning_rate=0.4,
    gamma=0.1,
)

In [36]:
# Fit the boosted model on the averaged tweet vectors.
xgb_clf.fit(x_train_vec_avg,y_train)

In [37]:
# XGBoost accuracy on the training split.
y_pred = xgb_clf.predict(x_train_vec_avg)
accuracy_score(y_true=y_train, y_pred=y_pred)

0.8491161616161617

In [38]:
# XGBoost accuracy on the held-out test split.
y_pred = xgb_clf.predict(x_test_vec_avg)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8459595959595959

In [39]:
# k-nearest neighbours with k=25; Minkowski distance with p=1 is the Manhattan (L1) distance.
knn = KNeighborsClassifier(n_neighbors = 25, metric = 'minkowski',p = 1)

In [40]:
# Fit the k-NN classifier on the averaged tweet vectors.
knn.fit(x_train_vec_avg,y_train)

In [41]:
# k-NN accuracy on the training split.
y_pred = knn.predict(x_train_vec_avg)
accuracy_score(y_true=y_train, y_pred=y_pred)

0.8503787878787878

In [42]:
# k-NN accuracy on the held-out test split.
y_pred = knn.predict(x_test_vec_avg)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8415404040404041

In [43]:
# Start an empty Keras Sequential model; layers are appended in the following cell.
# Re-running this cell discards any previously added layers and trained weights.
model = Sequential()

In [44]:
# Three ReLU hidden layers of decreasing width, followed by a single sigmoid
# unit that outputs the probability of label 1.
for hidden_units in (85, 80, 65):
  model.add(Dense(hidden_units, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [45]:
# Binary cross-entropy loss with the Adam optimiser; accuracy is tracked as a metric.
model.compile(loss = 'binary_crossentropy',optimizer = 'adam',metrics = ['accuracy'])

In [46]:
# Convert the list of per-tweet vectors and the label Series to NumPy arrays for Keras.
x_train_new = np.array(x_train_vec_avg)
y_train_new = np.array(y_train)

In [47]:
# Train for 100 epochs, holding out 20% of the training rows for validation.
# `history` keeps the value returned by fit for later inspection.
history = model.fit(x_train_new,y_train_new,epochs=100,validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [48]:
# Same conversion as for the training data: NumPy arrays for Keras.
x_test_new = np.array(x_test_vec_avg)
y_test_new = np.array(y_test)

In [49]:
# Evaluate on the held-out test split; the result is [loss, accuracy] as configured in compile().
model.evaluate(x_test_new,y_test_new)



[0.3328855633735657, 0.8491161465644836]

In [50]:
# Human-readable description for each class label (1 = negative, 0 = positive).
label_dict = {
    1: 'Negative sentiment on the product',
    0: 'Positive sentiment on the product',
}

In [51]:
def sentiment_analyze(tweet):
  """Print the predicted sentiment label for a single raw tweet.

  The tweet goes through the same pipeline as the training data: URLs are
  stripped, the text is tokenised with gensim's simple_preprocess, and the
  Word2Vec vectors of the in-vocabulary tokens are averaged into one feature
  vector, which is passed to the Keras model.

  Args:
    tweet: Raw tweet text.

  Returns:
    None. The description looked up in label_dict is printed.
  """
  tokens = gensim.utils.simple_preprocess(remove_URL(tweet))
  dim = model_gs.vector_size
  # Dict membership is O(1); scanning the `words` list costs O(vocabulary) per token.
  known_vectors = [model_gs.wv[tok] for tok in tokens if tok in model_gs.wv.key_to_index]
  if known_vectors:
    features = np.mean(known_vectors, axis=0)
  else:
    # Mirror the training-time fallback: a tweet without any known token is a
    # zero vector. Previously the empty mean was NaN and the reshape raised.
    features = np.zeros(dim, dtype=float)
  probability = float(model.predict(features.reshape(1, dim))[0][0])
  # A single threshold covers every value; a probability of exactly 0.5 used
  # to match neither branch and raised KeyError on the label lookup.
  label = 1 if probability >= 0.5 else 0
  print(label_dict[label])

In [52]:
# Example tweet with positive wording.
tweet = 'wow!!! my apple iphone 11 is simply the best #iphonelife #love #awesome'

In [58]:
# NOTE(review): the recorded output for this positively worded tweet is
# 'Negative sentiment on the product' — treat the model's predictions with caution.
sentiment_analyze(tweet)

Negative sentiment on the product


In [57]:
# Example tweet with negative wording.
tweet2 = 'wow!! apple devices have become a nightmare.stupid phone'

In [55]:
# Classify the negatively worded example.
sentiment_analyze(tweet2)

Negative sentiment on the product
