In [1]:
import numpy as np
import pandas as pd
import re
import pickle
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Fetch the NLTK English stop-word corpus that the preprocessing step relies on.
# When the package is already present, NLTK just reports it as up-to-date.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/amey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Show the English stop words that get filtered out of each tweet during preprocessing
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
# Read the tweets CSV into a pandas DataFrame.
# header=None keeps the first row as data (pandas assigns integer column labels);
# the file is decoded as ISO-8859-1.

twitter_data = pd.read_csv(
    'twitter_sentiment_dataset.csv',
    encoding='ISO-8859-1',
    header=None,
)

In [5]:
# Preview the first rows of the freshly loaded frame (columns are still integer-labelled)
twitter_data.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
# Give the six integer-labelled columns descriptive names

twitter_data.columns = [
    'target',
    'ids',
    'date',
    'flag',
    'user',
    'text',
]

In [7]:
# Confirm the new column names were applied
twitter_data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [8]:
# Inspect the last rows as well; the first rows only showed target 0
twitter_data.tail()

Unnamed: 0,target,ids,date,flag,user,text
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [9]:
# Number of rows and columns in the dataset
twitter_data.shape

(1600000, 6)

In [10]:
# Count missing values per column

twitter_data.isna().sum()

target    0
ids       0
date      0
flag      0
user      0
text      0
dtype: int64

In [11]:
# Checking the class distribution of the target column (how many tweets carry each label)

twitter_data['target'].value_counts()

0    800000
4    800000
Name: target, dtype: int64

In [12]:
# Remap the positive label from 4 to 1 so the target column is binary (0 = negative, 1 = positive)

twitter_data['target'] = twitter_data['target'].replace(4, 1)

In [13]:
# Verify the relabelling: the counts should now be split between 0 and 1
twitter_data['target'].value_counts()

0    800000
1    800000
Name: target, dtype: int64

In [14]:
# Stemming: a single PorterStemmer instance, created once and shared by the
# `stemming` function below

stemmer = PorterStemmer()

In [15]:
# Cache for the English stop-word set. It is filled on first use so that the
# corpus is read once in total instead of once per word of every tweet.
_STOP_WORD_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _STOP_WORD_CACHE
    if _STOP_WORD_CACHE is None:
        _STOP_WORD_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE


def stemming(tweet):
    """Clean a tweet and reduce its words to their Porter stems.

    Processing steps:
      1. every character that is not an ASCII letter is replaced by a space,
      2. the text is lowercased and split on whitespace,
      3. English stop words are dropped,
      4. each remaining word is stemmed with the module-level `stemmer`.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The stemmed words joined by single spaces (an empty string when no
        word survives the filtering).
    """
    # Replace digits, punctuation and any other non-letter character with a space
    letters_only = re.sub('[^a-zA-Z]', ' ', tweet)

    # Lowercase, then split into individual words
    words = letters_only.lower().split()

    # Set membership is O(1); the original rebuilt the stop-word list for every word
    stop_words = _english_stop_words()
    stemmed_words = [stemmer.stem(word) for word in words if word not in stop_words]

    return ' '.join(stemmed_words)

In [16]:
# Preprocess every tweet and keep the result in a new column next to the raw text
twitter_data['stemmed_tweet'] = twitter_data['text'].apply(stemming)

In [17]:
# Compare the raw text with its stemmed counterpart
twitter_data.head()

Unnamed: 0,target,ids,date,flag,user,text,stemmed_tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [18]:
# Look at the stemmed column on its own
print(twitter_data['stemmed_tweet'])

0          switchfoot http twitpic com zl awww bummer sho...
1          upset updat facebook text might cri result sch...
2          kenichan dive mani time ball manag save rest g...
3                            whole bodi feel itchi like fire
4                              nationwideclass behav mad see
                                 ...                        
1599995                           woke school best feel ever
1599996    thewdb com cool hear old walt interview http b...
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999    happi charitytuesday thenspcc sparkschar speak...
Name: stemmed_tweet, Length: 1600000, dtype: object


In [19]:
# Separating the data into features and labels:
# X holds the stemmed tweet strings, Y the binary sentiment labels

X = twitter_data['stemmed_tweet'].values
Y = twitter_data['target'].values

In [20]:
# Feature array: one stemmed tweet string per entry
print(X)

['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']


In [21]:
# Label array aligned with X
print(Y)

[0 0 0 ... 1 1 1]


In [22]:
# Hold out 20% of the tweets for testing.
# stratify=Y splits by label so both sets keep the label proportions;
# random_state=2 makes the split repeatable.

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [23]:
# Converting textual data to numerical data (TF-IDF features)

vectorizer = TfidfVectorizer()

# The vectorizer is fitted on the training tweets only; the test tweets are
# merely transformed with it, so nothing is learned from the test set.
XV_train = vectorizer.fit_transform(X_train)
XV_test = vectorizer.transform(X_test)

In [24]:
# Creating the logistic regression model (the solver is allowed up to 1000 iterations)

LR_model = LogisticRegression(max_iter=1000)

In [25]:
# Training the model on the TF-IDF features of the training split

LR_model.fit(XV_train, Y_train)

LogisticRegression(max_iter=1000)

In [26]:
# Accuracy score on the training data (data the model has already seen)

XV_train_pred = LR_model.predict(XV_train)
training_data_accuracy = accuracy_score(Y_train, XV_train_pred)

print(training_data_accuracy)

0.81018515625


In [27]:
# Accuracy score on the held-out test data (the estimate of real-world performance)

XV_test_pred = LR_model.predict(XV_test)
test_data_accuracy = accuracy_score(Y_test, XV_test_pred)

print(test_data_accuracy)

0.778003125


In [28]:
# Saving the trained model using pickle
# NOTE(review): only the classifier is written here. New tweets must also go
# through the fitted `vectorizer`, which this cell does not persist.

filename = 'trained_model.sav'
# The context manager flushes and closes the file even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(LR_model, model_file)

In [29]:
# Using the saved model for predictions

# Loading the saved model.
# pickle can execute arbitrary code while loading, so only load files that
# were produced by this notebook or another trusted source.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [30]:
def output(n):
    """Turn a binary sentiment label into a human-readable sentence.

    Parameters
    ----------
    n : int
        Predicted label: 0 for negative, 1 for positive.

    Returns
    -------
    str or None
        The matching sentence, or None when `n` is neither 0 nor 1.
    """
    label_messages = (
        (0, "It is a negative tweet."),
        (1, "It is a positive tweet."),
    )
    for label, message in label_messages:
        if n == label:
            return message
    return None

In [31]:
def manual_test(tweet):
    """Predict the sentiment of a single raw tweet.

    The tweet is preprocessed with `stemming` (the same cleaning applied to
    the training data), vectorised with the already-fitted `vectorizer`, and
    classified with `loaded_model`, the model reloaded from disk.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        "LR Prediction: " followed by the sentence `output` returns for the
        predicted label.
    """
    # Apply the same cleaning and stemming that the training tweets went through
    cleaned_tweet = stemming(tweet)

    # transform() takes an iterable of documents, so wrap the single tweet in a list
    tweet_features = vectorizer.transform([cleaned_tweet])

    # Predict with the model reloaded from disk so the pickled file is actually exercised
    predicted_label = loaded_model.predict(tweet_features)[0]

    return f"LR Prediction: {output(predicted_label)}"

In [32]:
# Read the tweet to classify from the user (input() already returns a str)
test_tweet = input()

Things are about to change for you. May the overthinking, and the doubt exit your mind right now. May clarity replace confusion. May peace and calmness fill your life. You’ve been strong long enough, it’s time to start receiving your blessings. You deserve it.


In [33]:
# Classify the tweet entered above and show the verdict
result = manual_test(test_tweet)
print(result)

LR Prediction: It is a positive tweet.
