In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tweets-sentiment-analysis/vocab.py
/kaggle/input/tweets-sentiment-analysis/vocab.json
/kaggle/input/tweets-sentiment-analysis/train_data.csv
/kaggle/input/tweets-sentiment-analysis/test_data.csv
/kaggle/input/tweets-sentiment-analysis/Exploring Data.ipynb


In [2]:


from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from tqdm import tqdm
import multiprocessing as mp


In [3]:
path = "/kaggle/input/tweets-sentiment-analysis/train_data.csv"
#path2= "/kaggle/input/tweets-sentiment-analysis/test_data.csv"
twitter_data = pd.read_csv(path)
#test_data= pd.read_csv(path)
twitter_data.index.name = "index"
twitter_data.head()

Unnamed: 0_level_0,sentence,sentiment
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,awww that s a bummer you shoulda got david car...,0
1,is upset that he can t update his facebook by ...,0
2,i dived many times for the ball managed to sav...,0
3,my whole body feels itchy and like its on fire,0
4,no it s not behaving at all i m mad why am i h...,0


In [4]:
stopwordsList = set(ENGLISH_STOP_WORDS)
print(list(stopwordsList)[:10])
clean_pattern = re.compile(r'[^a-zA-Z]')#^~ means removing everything that is not a letter.


['please', 'made', 'alone', 'herself', 'yours', 'elsewhere', 'call', 'onto', 'amongst', 'hundred']


In [5]:
twitter_data.shape


(1523975, 2)

In [6]:
twitter_data.isnull().sum()

sentence     0
sentiment    0
dtype: int64

In [7]:
twitter_data['sentiment'].value_counts()

sentiment
0    767059
1    756916
Name: count, dtype: int64

In [8]:
portStemmer= PorterStemmer() #stemming is used to reduce a word to its root word.

In [9]:
def stemming(content):
    stemmedContent = clean_pattern.sub(' ', content) 
    stemmedContent = stemmedContent.lower()
    stemmedContent = stemmedContent.split() #splitting the tweet into lit of words
    stemmedContent = [portStemmer.stem(word) for word in stemmedContent if not word in stopwordsList]
    stemmedContent = ' '.join(stemmedContent)

    return stemmedContent


In [10]:
def parallel_stemming(data, num_workers=mp.cpu_count()):
    with mp.Pool(num_workers) as pool:
        results = list(tqdm(pool.imap(stemming, data), total=len(data)))  # Show progress bar
    return results

In [11]:
twitter_data['stemmedContent'] = parallel_stemming(twitter_data['sentence'])


100%|██████████| 1523975/1523975 [03:03<00:00, 8285.68it/s]


In [12]:
twitter_data.head()

Unnamed: 0_level_0,sentence,sentiment,stemmedContent
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,awww that s a bummer you shoulda got david car...,0,awww s bummer shoulda got david carr day d
1,is upset that he can t update his facebook by ...,0,upset t updat facebook text result school toda...
2,i dived many times for the ball managed to sav...,0,dive time ball manag save rest bound
3,my whole body feels itchy and like its on fire,0,bodi feel itchi like
4,no it s not behaving at all i m mad why am i h...,0,s behav m mad t


In [13]:
X= twitter_data['stemmedContent'].values
y= twitter_data['sentiment'].values

In [14]:
print(X)

['awww s bummer shoulda got david carr day d'
 'upset t updat facebook text result school today blah'
 'dive time ball manag save rest bound' ... 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday']


In [15]:
print(y)

[0 0 0 ... 1 1 1]


In [16]:
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.2, stratify=y,random_state=2)

In [17]:
print(X.shape,x_train.shape,x_test.shape)

(1523975,) (1219180,) (304795,)


In [18]:
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)


In [19]:
print(x_train)
print(x_test)

  (0, 161753)	0.7085850087516308
  (0, 34362)	0.7056254568625281
  (1, 12312)	0.22162047955797365
  (1, 163547)	0.26680301425437103
  (1, 119100)	0.2988612109670705
  (1, 43781)	0.32135146429446165
  (1, 102758)	0.25691653153146793
  (1, 161335)	0.4132289864242559
  (1, 160711)	0.2478527251069974
  (1, 44232)	0.23931456263812517
  (1, 94608)	0.17281005850654177
  (1, 44213)	0.48358398662783436
  (1, 1704)	0.2606380757438443
  (2, 89694)	0.36262431684480045
  (2, 159539)	0.5236161267890233
  (2, 10913)	0.6981407332325676
  (2, 94608)	0.32700041773795413
  (3, 85031)	0.3049088403020526
  (3, 22174)	0.25534296241239424
  (3, 122320)	0.4768961521671171
  (3, 183150)	0.2755800797982844
  (3, 98554)	0.29058581546394746
  (3, 72868)	0.49098324253449427
  (3, 133534)	0.2151246428619407
  (3, 179011)	0.40825601986214266
  :	:
  (1219176, 30480)	0.673801903653383
  (1219176, 30486)	0.2906556365825624
  (1219176, 58119)	0.17120621434081468
  (1219176, 169853)	0.20623720834307624
  (1219176, 11795

In [20]:
model = LogisticRegression(max_iter=1000, solver='saga', n_jobs=-1)

with tqdm(total=100, desc="Training Progress", bar_format="{l_bar}{bar} {n_fmt}/{total_fmt} [{elapsed}<{remaining}]") as pbar:
    for i in range(100):  # Simulating incremental progress
        model.fit(x_train, y_train)  # Train the model
        pbar.update(1)

Training Progress: 100%|██████████ 100/100 [37:49<00:00]


In [21]:
y_pred= model.predict(x_train)
training_accuracy = accuracy_score(y_train,y_pred)
print ("Training accuracy: ",training_accuracy)

Training accuracy:  0.7879771649797405


In [22]:
y_pred= model.predict(x_test)
test_accuracy = accuracy_score(y_test,y_pred)
print ("Test Data accuracy: ",test_accuracy)

Test Data accuracy:  0.7700946537836907


In [23]:
import pickle

In [24]:
filename= "tsa_trained_model.sav"
pickle.dump(model, open(filename,'wb'))

In [25]:
#load model
loadedmodel= pickle.load(open('/kaggle/working/tsa_trained_model.sav','rb'))

In [26]:
X_new=x_test[131]
print(y_test[131])

pred= model.predict(X_new)

if (pred[0]==0):
    print('Negative Tweet')
else:
    print('Positive Tweet')

0
Negative Tweet
