<a href="https://colab.research.google.com/github/abakm/AL-ML_Assignment-1/blob/master/Assignment_05Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Install  gensim

In [169]:
pip install gensim



# Import libraries

In [170]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer



nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load dataset

In [171]:
df = pd.read_csv('./tweets.csv')
df.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


# Preprocessing

### Create preprocess function

In [172]:
lemmatizer = WordNetLemmatizer()
STOPWORDS = stopwords.words('english')
def preprocess_tweet(words):
    words = simple_preprocess(words)
    words = [word.lower() for word in words if word.isalpha()] # filter only alphabets
    words = [word for word in words if word not in STOPWORDS] # Remove stop words
    words = [lemmatizer.lemmatize(word) for word in words] # Lemmanizatio
    return words

## Preprocess target and features

In [173]:
tweets= df['tweet'].apply(lambda tweet: preprocess_tweet(tweet))
labels  = df['label']
tweets.head()

Unnamed: 0,tweet
0,"[fingerprint, pregnancy, test, http, goo, gl, ..."
1,"[finally, transparant, silicon, case, thanks, ..."
2,"[love, would, go, talk, makememories, unplug, ..."
3,"[wired, know, george, made, way, iphone, cute,..."
4,"[amazing, service, apple, even, talk, question..."


# Split dataset

In [174]:
x_train, x_test, y_train, y_test = train_test_split(tweets, labels, test_size=0.2)


# Vectorization

In [175]:
w2v_model = Word2Vec(x_train, min_count=1)
words = w2v_model.wv.index_to_key
words

['http',
 'iphone',
 'com',
 'apple',
 'instagram',
 'samsung',
 'new',
 'twitter',
 'phone',
 'sony',
 'instagr',
 'follow',
 'pic',
 'www',
 'like',
 'ipad',
 'love',
 'io',
 'day',
 'android',
 'life',
 'galaxy',
 'photo',
 'case',
 'rt',
 'ly',
 'cute',
 'app',
 'photography',
 'get',
 'today',
 'back',
 'gain',
 'fun',
 'itunes',
 'got',
 'music',
 'bit',
 'news',
 'work',
 'happy',
 'instagood',
 'time',
 'smile',
 'one',
 'beautiful',
 'girl',
 'co',
 'funny',
 'make',
 'lol',
 'p',
 'ipod',
 'birthday',
 'apps',
 'friend',
 'tech',
 'fashion',
 'photooftheday',
 'game',
 'iphonex',
 'finally',
 'update',
 'everyone',
 'note',
 'tt',
 'mac',
 'amazing',
 'gift',
 'good',
 'product',
 'selfie',
 'must',
 'sougofollow',
 'want',
 'fuck',
 'tmblr',
 'sale',
 'home',
 'would',
 'follower',
 'free',
 'family',
 'igers',
 'fail',
 'thanks',
 'best',
 'camera',
 'baby',
 'hate',
 'look',
 'go',
 'fucking',
 'cool',
 'art',
 'rts',
 'year',
 'iphonesia',
 'plus',
 'iphoneonly',
 'mobile

# Find vectors of each word in tweets

###Define functionality for vectorization

In [176]:
def get_vectors(tweets):
  vectors = list()
  #Iterate throgh each message
  for tweet in tweets:
  #Initialize an empty list to store word vectors of current list of tokens
    tweet_vectors = list()
    for word in tweet:
      if word in words:
        tweet_vectors.append(w2v_model.wv[word])

    if len(tweet_vectors):
      vectors.append(np.mean(tweet_vectors, axis=0))
    else:
      vectors.append(np.zeros(100, dtype=float))

  return vectors

### Get vectors for train and test tweets

In [177]:
x_test_vectors_avg = get_vectors(x_test)
x_train_vectors_avg = get_vectors(x_train)

print("x_train_vectors_avg:", x_train_vectors_avg)
print("x_test_vectors_avg:", x_test_vectors_avg)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        0.7470612 , -0.19881317,  0.2598495 , -0.8673099 , -0.42150602,
        0.84126294,  0.26974252, -0.47245455, -0.3991294 , -0.9784382 ,
        0.1287408 , -0.35939178, -0.13617635,  0.1615326 ,  0.64744496,
       -0.5064527 , -0.6043167 , -0.11080668,  0.37338683,  0.44587612,
        0.54516023, -0.6395032 , -0.2171644 , -0.14961134, -0.56085145,
        0.19530001,  0.5301557 , -0.05688148, -0.7315437 ,  0.2619358 ,
        0.3115248 ,  0.02663043,  0.02464133, -0.26664966, -0.96980894,
        0.7994301 ,  0.11154628,  0.7058446 , -0.83799505,  0.58807343,
       -0.22283527,  0.37583488,  0.7387277 ,  0.09029388,  0.7178589 ,
        0.4581378 , -0.11653414,  0.00864862, -0.49179983, -0.05368882,
       -0.44913918, -0.08701909, -0.2672785 ,  0.8107775 , -0.30536932,
       -0.0957105 ,  0.4325026 ,  0.54542005,  0.50334466,  0.28145945,
        0.58582526,  0.17212614,  0.19052885, -0.15130898,  1.3874396 ,

# Create Model

In [178]:
rf = RandomForestClassifier()

# Fit the model

In [179]:
rf.fit(x_train_vectors_avg, y_train)

# Predict the model

In [180]:
predict = rf.predict(x_test_vectors_avg)
accuracy_score(y_test, predict) # Evaluation metric

0.8743686868686869

#Predictation

In [181]:
text = "id totally fuck siri her knowledge excites me sad"
text = preprocess_tweet(text)
if set(text).intersection(set(words)):
  text = get_vectors([text])
  print("positive" if rf.predict(text) == [1] else "negative")
else:
  print("Words in the text  not in my vocabulary")

positive
