In [1]:
!pip install kaggle



In [2]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
!kaggle datasets  download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 93% 75.0M/80.9M [00:00<00:00, 187MB/s]
100% 80.9M/80.9M [00:00<00:00, 164MB/s]


In [4]:
!unzip sentiment140.zip

Archive:  sentiment140.zip
  inflating: training.1600000.processed.noemoticon.csv  


In [23]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
data = pd.read_csv('training.csv',encoding = 'ISO-8859-1')
data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [14]:
data.shape

(1599999, 6)

In [15]:
column_names = ['target','ids','date','flag','user','text']
data.columns = column_names
data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [16]:
data.isnull().sum()

Unnamed: 0,0
target,0
ids,0
date,0
flag,0
user,0
text,0


In [17]:
data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
4,800000
0,799999


In [18]:
data.replace({'target':{4:1}}, inplace=True)


In [19]:
data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,800000
0,799999


In [20]:
port_stem = PorterStemmer()

In [24]:
stop_words = set(stopwords.words('english'))

In [25]:
# def stemming(content):
#     stemmed_content = re.sub('[^a-zA-Z]',' ',content)
#     stemmed_content = stemmed_content.lower()
#     stemmed_content = stemmed_content.split()
#     stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
#     stemmed_content = ' '.join(stemmed_content)
#     return stemmed_content
def stemming(content):
    try:
        port_stem = PorterStemmer()  # Instantiate inside the function
        stemmed_content = re.sub('[^a-zA-Z]', ' ', content).lower()
        return ' '.join(port_stem.stem(word) for word in stemmed_content.split() if word not in stop_words)
    except Exception as e:
        print(f"Error processing content: {content}. Error: {e}")
        return ""  # Return an empty string on error

In [26]:
def process_data(df):
    with ThreadPoolExecutor() as executor:
        return list(tqdm(executor.map(stemming, df['text']), total=len(df)))

In [27]:
chunk_size = 50000  # Adjust based on your memory capacity
num_chunks = len(data) // chunk_size + 1
stemmed_contents = []

for i in tqdm(range(num_chunks)):
    start = i * chunk_size
    end = min((i + 1) * chunk_size, len(data))
    chunk = data.iloc[start:end]
    stemmed_chunk = process_data(chunk)
    stemmed_contents.extend(stemmed_chunk)

  0%|          | 0/32 [00:00<?, ?it/s]
  0%|          | 0/50000 [00:00<?, ?it/s][A
100%|██████████| 50000/50000 [00:05<00:00, 9748.47it/s]  
  3%|▎         | 1/32 [00:16<08:21, 16.18s/it]
  0%|          | 0/50000 [00:00<?, ?it/s][A
  8%|▊         | 4002/50000 [00:00<00:01, 36056.90it/s][A
 31%|███       | 15264/50000 [00:00<00:00, 79139.88it/s][A
 69%|██████▉   | 34404/50000 [00:00<00:00, 129044.94it/s][A
100%|██████████| 50000/50000 [00:05<00:00, 9192.16it/s]
  6%|▋         | 2/32 [00:33<08:28, 16.96s/it]
  0%|          | 0/50000 [00:00<?, ?it/s][A
 10%|█         | 5084/50000 [00:00<00:00, 50833.99it/s][A
 20%|██        | 10168/50000 [00:00<00:01, 39367.45it/s][A
 30%|███       | 15094/50000 [00:00<00:00, 40781.44it/s][A
 40%|████      | 20209/50000 [00:00<00:00, 44375.06it/s][A
 50%|████▉     | 24754/50000 [00:00<00:00, 42130.99it/s][A
 58%|█████▊    | 29042/50000 [00:00<00:00, 41474.25it/s][A
 66%|██████▋   | 33235/50000 [00:00<00:00, 37414.66it/s][A
 74%|███████▍  | 37

In [28]:
data['stemmed'] = stemmed_contents

In [29]:
data.head()

Unnamed: 0,target,ids,date,flag,user,text,stemmed
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew,kwesidei whole crew


In [30]:
X = data['stemmed']
Y = data['target']

In [31]:
X.head()

Unnamed: 0,stemmed
0,upset updat facebook text might cri result sch...
1,kenichan dive mani time ball manag save rest g...
2,whole bodi feel itchi like fire
3,nationwideclass behav mad see
4,kwesidei whole crew


In [32]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

In [34]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [35]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)

In [36]:
accuracy_score(model.predict(X_train), Y_train)

0.801063125830567

In [37]:
accuracy_score(model.predict(X_test), Y_test)

0.7771625

In [47]:
gm = stemming("Bad morning")
gm = vectorizer.transform([gm])
model.predict(gm)

array([0])

In [45]:
import pickle

In [46]:
pickle.dump(model, open('model.pkl', 'wb'))

In [48]:
model = pickle.load(open('./model.pkl','rb'))

In [60]:

def makePrediction (content):
  gm = stemming(content)
  gm = vectorizer.transform([gm])
  pred = model.predict(gm)

  if pred[0] == 0:
    print('Negative')
  else:
    print('Positive')

In [65]:
makePrediction("yes, lot of business models don't work in india thats why its very difficult to make money here")

Negative
