In [35]:
import pandas as pd

load dataset


In [36]:
data = pd.read_csv('/content/tweets.csv')

In [37]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [38]:
data.shape

(7920, 3)

Text preprocessing

In [39]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7920 entries, 0 to 7919
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7920 non-null   int64 
 1   label   7920 non-null   int64 
 2   tweet   7920 non-null   object
dtypes: int64(2), object(1)
memory usage: 185.8+ KB


In [40]:
data.isna().sum()# checking missing values

Unnamed: 0,0
id,0
label,0
tweet,0


In [41]:
data.columns

Index(['id', 'label', 'tweet'], dtype='object')

In [42]:
data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,5894
1,2026


Text cleaning


In [43]:
import nltk
import numpy as np
import string
nltk.download('stopwords')
from nltk.corpus import stopwords
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
stop_words = set(stopwords.words('english'))

def clean_tweet(tweet):
    #convert to lowercase
    tweet = tweet.lower()
    #remove urls
    tweet = re.sub(r'http\S+', '', tweet)
    #remove mentions
    tweet = re.sub(r'@\w+', '', tweet)
    #remove hashtags
    tweet = re.sub(r'#\w+', '', tweet)
    #remove numbers
    tweet = re.sub(r'\d+', '', tweet)
    #remove special characters
    tweet = re.sub(r'[^\w\s]', '', tweet)
    #remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    #remove stopwords
    tweet = ' '.join([word for word in tweet.split() if word not in stop_words])
    return tweet

data['cleaned_tweet'] = data['tweet'].apply(clean_tweet)

In [45]:
data.head()

Unnamed: 0,id,label,tweet,cleaned_tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,test
1,2,0,Finally a transparant silicon case ^^ Thanks t...,finally transparant silicon case thanks uncle
2,3,0,We love this! Would you go? #talk #makememorie...,love would go
3,4,0,I'm wired I know I'm George I was made that wa...,im wired know im george made way
4,5,1,What amazing service! Apple won't even talk to...,amazing service apple wont even talk question ...


train test split

In [46]:
from sklearn.model_selection import train_test_split
x = data['cleaned_tweet']
y = data['label']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

text vectorization

In [47]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
#using tf and idf
tfidf_vectorizer = TfidfVectorizer()
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)


model training


In [48]:
#using logistic regression model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train_tfidf,y_train)


In [49]:
y_pred = model.predict(x_test_tfidf)


Evaluation of model

In [50]:
# checking accuracy and confusion matrix
from sklearn.metrics import accuracy_score,confusion_matrix
print('accuracy: ',accuracy_score(y_test,y_pred))
print('confusion matrix:\n ',confusion_matrix(y_test,y_pred))

accuracy:  0.8345959595959596
confusion matrix:
  [[1098   54]
 [ 208  224]]


 MSE and Rsquare value


In [56]:
#check overfitting in linear regression
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Mean Squared Error:', mse)
print('R-squared:', r2)


Mean Squared Error: 0.16540404040404041
R-squared: 0.1660879629629628
