In [5]:
import numpy as np
import pandas as pd


In [10]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [11]:
import nltk
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Aayush
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# Collect the English stop words from the NLTK corpus into a set and print them for inspection
stop_words = set(stopwords.words('english'))
print(stop_words)

{"isn't", 'are', "it'd", 'be', 'then', "that'll", 'theirs', 'there', "mustn't", 'been', "should've", "shouldn't", "they're", "won't", "shan't", 'wasn', 'not', 'do', 'why', 'when', 'here', "mightn't", 'hers', 'on', 'themselves', 'once', 'am', 'mustn', 'mightn', 'own', "she's", "they've", 'did', 'were', 'for', 'more', 'him', 'herself', "hadn't", "we're", 'down', 'i', 'hasn', 'only', 'during', "you'll", 'ours', 'them', 'between', 'we', 'she', 'which', 'or', 'these', 'yours', 'weren', 'where', "wasn't", "hasn't", "we'd", 'doesn', "she'd", 'further', 't', "needn't", "don't", 'through', 'under', 'until', 'now', 'about', 'to', 'ourselves', 'up', 'will', 'aren', 'their', "he'll", "i'll", 'what', 'isn', "i've", 'wouldn', 're', 'in', 'off', 'is', 'does', 'no', 'm', 'so', 'has', 'y', 'too', 'yourselves', 'himself', 'd', 'very', "you've", 'll', 'he', 'while', 'same', "he's", 'our', "you'd", 'few', "didn't", "they'd", 'each', 'they', 'hadn', 'but', 'don', 'had', "weren't", 'shouldn', 'of', 'shan', 

# Data Processing 

In [14]:
# Initial exploratory load with no column names supplied: pandas treats the
# first data row as the header, so the frame comes back one row short.
twitter_data = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
)

In [18]:
twitter_data.shape

(1599999, 6)

In [16]:
twitter_data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [21]:
# Name the columns explicitly and read the dataset again so that the first
# row is kept as data instead of being consumed as the header.
column_names = ['Target', 'Ids', 'Date', 'Flag', 'User', 'Text']
twitter_data = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=column_names,
    encoding='ISO-8859-1',
)
twitter_data.head()

Unnamed: 0,Target,Ids,Date,Flag,User,Text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [22]:
# counting the number of missing values in each column
twitter_data.isnull().sum()

Target    0
Ids       0
Date      0
Flag      0
User      0
Text      0
dtype: int64

In [23]:
# checking the distribution of target variable
twitter_data['Target'].value_counts()

Target
0    800000
4    800000
Name: count, dtype: int64

In [26]:
# Convert the positive label 4 to 1; the negative label 0 stays 0.
# The mapping must be a dict ({4: 1}). A set literal ({4, 1}) is not a
# mapping, so the labels would be left unchanged.
twitter_data['Target'] = twitter_data['Target'].replace({4: 1})
twitter_data.tail()

Unnamed: 0,Target,Ids,Date,Flag,User,Text
1599995,1,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,1,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,1,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,1,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,1,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [27]:
twitter_data['Target'].value_counts()

Target
0    800000
1    800000
Name: count, dtype: int64

0 --> negative tweet      1 --> positive tweet


# Stemming

In [28]:
# Single shared Porter stemmer instance, used by stemming() for every word
port_stem = PorterStemmer()

In [35]:
# English stop words cached once as a frozenset, so each membership test is a
# hash lookup instead of calling stopwords.words('english') once per word.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalize a tweet and reduce its words to Porter stems.

    Every character that is not an ASCII letter is replaced with a space, the
    text is lower-cased and split on whitespace, English stop words are
    dropped, and each remaining word is stemmed with ``port_stem``.

    Parameters
    ----------
    content : str
        Raw tweet text.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    words = letters_only.lower().split()
    stems = [port_stem.stem(word) for word in words if word not in _ENGLISH_STOP_WORDS]
    return ' '.join(stems)

In [None]:
# Run stemming() on every value of the Text column and store the result in a new column
twitter_data['stemmed_content'] = twitter_data['Text'].apply(stemming)

In [None]:
twitter_data.head()

In [None]:
# Show the label next to the stemmed text. A DataFrame is not callable, so the
# two columns are selected with a list of names inside the indexing brackets.
print(twitter_data[['Target', 'stemmed_content']])

In [None]:
# Separate the features from the labels:
# X holds the stemmed tweet text, Y holds the Target column values.
X = twitter_data['stemmed_content'].values
Y = twitter_data['Target'].values

In [None]:
X

In [None]:
Y

# Splitting the data into training and test data

In [None]:
# Hold out 20% of the data for testing. stratify=Y keeps the label proportions
# the same in both splits, and random_state fixes the shuffle so the split is
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=2
)

In [None]:
print(X.shape, X_train.shape, X_test.shape)

In [None]:
print(Y.shape, Y_train.shape, Y_test.shape)

# converting the textual data to numerical data 

In [None]:
# Fit the TF-IDF vectorizer on the training text only, then apply the same
# fitted vectorizer to the test text.
# NOTE(review): X_train and X_test are rebound to the vectorized output, so
# re-running this cell without re-running the split would pass the vectorized
# data back into the vectorizer.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
print(X_train)

In [None]:
print(X_test)

# Training the ML model
Logistic Regression

In [None]:
# Logistic regression classifier with the iteration limit set to 1000
model = LogisticRegression(max_iter=1000)


In [None]:
model.fit(X_train, Y_train)

# MODEL EVALUATION
Accuracy score

In [None]:
# Accuracy on the training data: predict on the samples the model was fitted
# on and compare the predictions with their true labels.
X_train_predict = model.predict(X_train)
training_data_accuracy = accuracy_score(
    y_true=Y_train,
    y_pred=X_train_predict,
)

In [None]:
print('Accuracy on training data : ', training_data_accuracy)

In [None]:
# Accuracy on the test data: predict on the held-out samples and compare the
# predictions with their true labels.
X_test_predict = model.predict(X_test)
test_data_accuracy = accuracy_score(
    y_true=Y_test,
    y_pred=X_test_predict,
)

In [None]:
print('Accuracy on test data : ', test_data_accuracy)

# Saving the trained model

In [None]:
import pickle

In [None]:
# Serialize the trained model to disk. The with-block guarantees the file is
# flushed and closed even if pickling raises.
filename = 'trained_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Using the saved model for future predictions

In [None]:
# Restore the model from disk; the with-block closes the file handle afterwards.
# NOTE: pickle.load can execute arbitrary code, so only load files you created
# yourself or otherwise trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)


In [None]:
# Pick one vectorized test sample and print its true label for comparison
X_new = X_test[200]
print(Y_test[200])

# Predict with the model restored from disk (not the in-memory `model`), so
# this cell actually exercises the saved artifact.
prediction = loaded_model.predict(X_new)
print(prediction)

if prediction[0] == 1:
    print("The tweet is positive")
else:
    print("The tweet is negative")