# NLP: Bag of Words - Predicting if a Tweet about Apple is Negative or not


In [None]:
"""
The dataset tweets.csv contains the tweet and average sentiment of the tweet.
Avg is the average of 5 sentiments - 
-2 : Strongly Negative 
-1 : Negative 
 0 : Neutral
 1 : Positive
 2 : Strongly Positive 
""" 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [156]:
# Read the raw tweets and their average sentiment score into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer="tweets.csv")

In [139]:
# Preview the first five rows of the freshly loaded data.
dataset.head(n=5)

Unnamed: 0,Tweet,Avg
0,"I have to say, Apple has by far the best custo...",2.0
1,iOS 7 is so fricking smooth & beautiful!! #Tha...,2.0
2,LOVE U @APPLE,1.8
3,"Thank you @apple, loving my new iPhone 5S!!!!!...",1.8
4,.@apple has the best customer service. In and ...,1.8


In [221]:
""" 
We consider True for avg <= -1 and False otherwise (in terms of negativity).
avg <= -1 -> Negative (True)
avg > -1  -> Not Negative (False)
"""
# Vectorised flag: 1 where the average sentiment is -1 or lower, 0 otherwise.
dataset['Negative'] = (dataset['Avg'] <= -1).astype('int64')

In [226]:
# Preview the first five rows again, now including the 'Negative' label column.
dataset.head(n=5)

Unnamed: 0,Tweet,Avg,Negative
0,"I have to say, Apple has by far the best custo...",2.0,0
1,iOS 7 is so fricking smooth & beautiful!! #Tha...,2.0,0
2,LOVE U @APPLE,1.8,0
3,"Thank you @apple, loving my new iPhone 5S!!!!!...",1.8,0
4,.@apple has the best customer service. In and ...,1.8,0


# Using NLTK library

In [141]:
import nltk
#nltk.download()  #I have downloaded all required packages from NLTK

In [142]:
# English stop words from NLTK, kept as a set so the membership test used while
# cleaning each tweet is a fast lookup.
from nltk.corpus import stopwords

stop_words = set(stopwords.words(fileids='english'))

In [157]:
# Porter stemmer instance, reused for every token when the tweets are cleaned.
from nltk.stem import PorterStemmer

porter = PorterStemmer()

In [186]:
# Imports needed by this cell; without them it fails with NameError on a fresh kernel.
import re
from nltk.tokenize import word_tokenize

# Matches every character that is not an ASCII letter (punctuation, digits, symbols, ...).
NON_LETTERS = re.compile('[^a-zA-Z]')


def clean_tweet(text):
    """Turn one raw tweet into a space-separated string of stemmed tokens.

    Steps: replace non-letters with spaces, lower-case, tokenize, drop English
    stop words, stem each remaining token, and join the tokens back together.

    Parameters
    ----------
    text : str
        The raw tweet text.

    Returns
    -------
    str
        The cleaned tweet, ready to be fed to a vectorizer.
    """
    # replace everything that is not a letter (punctuation, digits, ...) with a space
    text = NON_LETTERS.sub(' ', text)

    # convert to lower case so the stop-word lookup below matches
    text = text.lower()

    # split it into words
    tokens = word_tokenize(text)

    # remove stop words
    tokens = [word for word in tokens if word not in stop_words]

    # stem the remaining words
    tokens = [porter.stem(word) for word in tokens]

    # join all words back into a single string
    return ' '.join(tokens)


# Cleaned tweets, in the same order as the rows of `dataset`.
# Iterating the column directly avoids label-based lookups (dataset['Tweet'][i]),
# which would break if the DataFrame index were not 0..n-1.
corpus = [clean_tweet(tweet) for tweet in dataset['Tweet']]

In [187]:
# Sanity check: display the first cleaned tweet.
corpus[0]

'say appl far best custom care servic ever receiv appl appstor'

# Generating Document Term Matrix or Bag of Words Model

In [190]:
from sklearn.feature_extraction.text import CountVectorizer

In [227]:
# Bag-of-words model: one row per tweet, one column per vocabulary term.
# The vocabulary is limited to the 3000 most frequent terms in the corpus.
cv = CountVectorizer(max_features=3000)

# X holds the independent variables: the dense document-term matrix of term counts.
X = cv.fit_transform(corpus).toarray()

# y holds the dependent variable. The label column is selected by name rather than
# by position (previously iloc[:, 2]) so the target stays correct regardless of
# how the columns of `dataset` are ordered.
y = dataset['Negative'].values

In [203]:
# Number of vocabulary terms, i.e. the number of columns in the document-term matrix.
X.shape[1]

3000

# Training the model

In [232]:
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Hold out 30% of the tweets for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [238]:
from sklearn.tree import DecisionTreeClassifier

# Baseline model: a single decision tree, seeded so the fit is repeatable.
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(X_train, y_train)   # fit() trains the estimator in place

# Predict on the held-out tweets.
y_pred = classifier.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Rows are the true classes and columns the predicted classes, in sorted label order.
cm = metrics.confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Accuracy: 0.8845070422535212
Consfusion Matrix:
[[291  22]
 [ 19  23]]


In [248]:
from sklearn.ensemble import RandomForestClassifier

# Ensemble model: a forest of 50 trees, seeded so the fit is repeatable.
classifier = RandomForestClassifier(n_estimators=50, random_state=42)
classifier.fit(X_train, y_train)   # fit() trains the estimator in place

# Predict on the held-out tweets.
y_pred = classifier.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Rows are the true classes and columns the predicted classes, in sorted label order.
cm = metrics.confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Accuracy: 0.9154929577464789
Consfusion Matrix:
[[310   3]
 [ 27  15]]
