<a href="https://colab.research.google.com/github/abakm/AL-ML_Assignment-1/blob/master/Assignment_05Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Install  gensim

In [143]:
pip install gensim



# Import libraries

In [144]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize



nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load dataset

In [145]:
df = pd.read_csv('./tweets.csv')
df.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


# Preprocessing

### Create preprocess function

In [146]:
lemmatizer = WordNetLemmatizer()
STOPWORDS = stopwords.words('english')
def preprocess_tweet(words):
    words = simple_preprocess(words)
    words = [word.lower() for word in words if word.isalpha()] # filter only alphabets
    words = [word for word in words if word not in STOPWORDS] # Remove stop words
    words = [lemmatizer.lemmatize(word) for word in words] # Lemmanizatio
    return words

## Preprocess target and features

In [147]:
tweets= df['tweet'].apply(lambda tweet: preprocess_tweet(tweet))
labels  = df['label']
tweets.head()

Unnamed: 0,tweet
0,"[fingerprint, pregnancy, test, http, goo, gl, ..."
1,"[finally, transparant, silicon, case, thanks, ..."
2,"[love, would, go, talk, makememories, unplug, ..."
3,"[wired, know, george, made, way, iphone, cute,..."
4,"[amazing, service, apple, even, talk, question..."


# Split dataset

In [148]:
x_train, x_test, y_train, y_test = train_test_split(tweets, labels, test_size=0.2)


# Vectorization

In [149]:
w2v_model = Word2Vec(x_train, min_count=1)
words = w2v_model.wv.index_to_key
words

['http',
 'iphone',
 'com',
 'apple',
 'instagram',
 'samsung',
 'twitter',
 'new',
 'phone',
 'sony',
 'instagr',
 'follow',
 'www',
 'pic',
 'like',
 'ipad',
 'love',
 'io',
 'day',
 'photo',
 'android',
 'case',
 'life',
 'galaxy',
 'rt',
 'ly',
 'get',
 'cute',
 'photography',
 'today',
 'back',
 'app',
 'gain',
 'got',
 'itunes',
 'fun',
 'music',
 'news',
 'bit',
 'happy',
 'work',
 'instagood',
 'time',
 'smile',
 'beautiful',
 'co',
 'girl',
 'lol',
 'funny',
 'one',
 'fashion',
 'ipod',
 'game',
 'tech',
 'make',
 'friend',
 'apps',
 'finally',
 'iphonex',
 'p',
 'birthday',
 'update',
 'photooftheday',
 'good',
 'tt',
 'everyone',
 'note',
 'gift',
 'product',
 'amazing',
 'must',
 'selfie',
 'mac',
 'sougofollow',
 'follower',
 'thanks',
 'tmblr',
 'free',
 'camera',
 'fail',
 'fuck',
 'year',
 'want',
 'go',
 'would',
 'hate',
 'family',
 'best',
 'home',
 'rts',
 'look',
 'cool',
 'gl',
 'igers',
 'baby',
 'suck',
 'plus',
 'sale',
 'iphoneonly',
 'picoftheday',
 'fucking'

# Find vectors of each word in tweets

###Define functionality for vectorization

In [150]:
def get_vectors(tweets):
  vectors = list()
  #Iterate throgh each message
  for tweet in tweets:
  #Initialize an empty list to store word vectors of current list of tokens
    tweet_vectors = list()
    for word in tweet:
      if word in words:
        tweet_vectors.append(w2v_model.wv[word])

    if len(tweet_vectors):
      vectors.append(np.mean(tweet_vectors, axis=0))
    else:
      vectors.append(np.zeros(100, dtype=float))

  return vectors

### Get vectors for train and test tweets

In [151]:
x_test_vectors_avg = get_vectors(x_test)
x_train_vectors_avg = get_vectors(x_train)

print("x_train_vectors_avg:", x_train_vectors_avg)
print("x_test_vectors_avg:", x_test_vectors_avg)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        2.0658092e-01, -1.0475326e+00,  2.6147690e-01,  1.3237361e+00,
       -2.8489774e-01, -5.6882274e-01, -1.8082467e-01, -8.5791963e-01,
        8.2342193e-02,  3.3168665e-01,  1.5775645e-01, -4.5030409e-01,
        5.3281581e-01, -7.3711789e-01, -4.2377958e-01, -1.3082668e+00,
        5.9249812e-01,  5.7402920e-02,  4.5551047e-01, -4.1618067e-01,
       -1.6823666e-01, -5.5021882e-02, -4.3610638e-01, -3.2949877e-01,
       -7.6823580e-01,  6.4672731e-02,  9.1604292e-01, -4.7939621e-02,
        2.3781590e-01, -5.4587466e-01, -1.9554459e-01,  7.1814549e-01,
        4.3753916e-01, -4.4429126e-01, -1.9684942e-01, -8.6000073e-01,
        3.1223100e-01, -5.3486854e-01, -3.9953563e-01,  1.5563250e-01,
        4.8905113e-01, -2.1399498e-01, -5.5080569e-01,  9.8487735e-02,
        1.6061309e-01,  3.8051003e-01,  2.9969293e-01, -8.0250913e-01,
       -6.6440213e-01, -8.3713338e-02, -3.3976939e-01,  4.0762499e-01,
        5.00

# Create Model

In [152]:
rf = RandomForestClassifier()

# Fit the model

In [153]:
rf.fit(x_train_vectors_avg, y_train)

# Evaluate the model

In [154]:
predict = rf.predict(x_test_vectors_avg)
accuracy_score(y_test, predict)

0.8718434343434344

#Predict the model

In [155]:
text = "id totally fuck siri her knowledge excites me sad"
text = preprocess_tweet(text)
if set(text).intersection(set(words)):
  text = get_vectors([text])
  print("positive" if rf.predict(text) == [1] else "negative")
else:
  print("Words in the text  not in my vocabulary")

positive
