In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Importing the dataset**

In [None]:
import pandas as pd
# Column labels are supplied by hand via names=, so pandas treats the first
# line of the file as data rather than as a header row.
columns = ['sentiment_rating','id','date','query','user','text']
# latin-1 is passed explicitly instead of relying on the default decoder.
df = pd.read_csv("/content/drive/MyDrive/Datasets/data.csv",encoding='latin-1',names = columns)
df.head(5)

Unnamed: 0,sentiment_rating,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


## **Analysis & Preprocessing of the dataset**

In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

(1600000, 6)

In [None]:
# Check how many distinct values 'query' takes before deciding whether to keep it.
df['query'].value_counts()

NO_QUERY    1600000
Name: query, dtype: int64

In [None]:
# Keep only the label, the author and the tweet text.
# Rebinding (instead of inplace=True) together with errors='ignore' makes this
# cell safe to re-run: a second run no longer raises KeyError for columns that
# are already gone.
df = df.drop(columns=['query', 'date', 'id'], errors='ignore')
df.head(5)

Unnamed: 0,sentiment_rating,user,text
0,0,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,scotthamilton,is upset that he can't update his Facebook by ...
2,0,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,ElleCTF,my whole body feels itchy and like its on fire
4,0,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
#For Links
# Links are removed before punctuation so each URL disappears as one unit.
# The www pattern is anchored with \b and a literal dot: an unanchored www\S+
# also matches inside ordinary words (e.g. "Awww," was cut down to "A").
link_patterns = [r'http\S+', r'\bwww\.\S+']
for link_pattern in link_patterns:
    df['text'] = df['text'].replace(link_pattern, '', regex=True)

#For Symbols
# [^\w\s] drops everything that is neither a word character nor whitespace;
# the underscore counts as a word character, so it is removed separately.
pattern = r'[^\w\s]'
df['text'] = df['text'].replace(pattern, '',regex = True)
df['text'] = df['text'].replace('_', '', regex = True)
df['user'] = df['user'].replace('_', '', regex = True)
df.head(50)

Unnamed: 0,sentiment_rating,user,text
0,0,TheSpecialOne,switchfoot A thats a bummer You shoulda got...
1,0,scotthamilton,is upset that he cant update his Facebook by t...
2,0,mattycus,Kenichan I dived many times for the ball Manag...
3,0,ElleCTF,my whole body feels itchy and like its on fire
4,0,Karoli,nationwideclass no its not behaving at all im ...
5,0,joywolf,Kwesidei not the whole crew
6,0,mybirch,Need a hug
7,0,coZZ,LOLTrish hey long time no see Yes Rains a bit...
8,0,2Hood4Hollywood,TatianaK nope they didnt have it
9,0,mimismo,twittera que me muera


## **Stemming**

In [None]:
from nltk.stem import PorterStemmer
import nltk
from nltk.corpus import stopwords
# Fetch the stop-word corpus that stopwords.words('english') reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Single shared stemmer instance, used by stemming() below.
port_stem = PorterStemmer()

In [None]:
def stemming(content):
  """Lower-case a text, drop English stop words and Porter-stem the rest.

  Args:
    content: the text to process (a str).

  Returns:
    The remaining stemmed words joined by single spaces.
  """
  # Load the stop-word list once and keep it on the function object.
  # Previously stopwords.words('english') was re-read for every single word and
  # searched as a list, which made applying this to a large column impractical.
  stop_words = getattr(stemming, '_stop_words', None)
  if stop_words is None:
    stop_words = frozenset(stopwords.words('english'))
    stemming._stop_words = stop_words

  tokens = content.lower().split()
  return ' '.join(port_stem.stem(word) for word in tokens if word not in stop_words)

In [None]:
#df['stemmed_content'] = df['text'].apply(stemming)

## **Classifying labels**

In [None]:
# Inspect which label values occur and how balanced they are.
df['sentiment_rating'].value_counts()

0    800000
4    800000
Name: sentiment_rating, dtype: int64

In [None]:
# Only two label values occur (0 and 4), so this is a binary problem:
# 0 stays the negative class and 4 is recoded to 1 for the positive class.
# A plain value mapping is used; regex=True is meant for string patterns and
# is documented as requiring a string to_replace, not an integer.
df['sentiment_rating'] = df['sentiment_rating'].replace({4: 1})
df.head(5)

Unnamed: 0,sentiment_rating,user,text
0,0,TheSpecialOne,switchfoot A thats a bummer You shoulda got...
1,0,scotthamilton,is upset that he cant update his Facebook by t...
2,0,mattycus,Kenichan I dived many times for the ball Manag...
3,0,ElleCTF,my whole body feels itchy and like its on fire
4,0,Karoli,nationwideclass no its not behaving at all im ...


## **Splitting the dataset**

In [None]:
# Features are the cleaned tweet texts; targets are the sentiment labels.
# .values hands plain NumPy arrays to scikit-learn.
X = df['text'].values
y = df['sentiment_rating'].values

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any convergence warning raised while fitting the model.
warnings.filterwarnings("ignore")

In [None]:
# Hold out 20% of the tweets for testing. The fixed random_state makes the
# split, and therefore every metric reported below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Sanity check: the train and test sizes should add up to the full dataset.
print(X.shape, X_train.shape, X_test.shape)

(1600000,) (1280000,) (320000,)


## **Feature Extraction (Converting to Numerical Values)**

In [None]:
# Fit the TF-IDF vocabulary and weights on the training split only, then reuse
# the fitted vectorizer on the test split so the test text does not influence the fit.
# NOTE(review): X_train / X_test are overwritten with the vectorised matrices,
# so re-running this cell on its own would pass matrices, not text, to the
# vectorizer; re-run the split cell first.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

## **Logistic Regression Model**

In [None]:
# Raise the iteration cap above the default of 100. This notebook silences all
# warnings, so a solver that stopped before converging would go unnoticed.
# A larger cap costs nothing extra when the solver converges early.
model = LogisticRegression(max_iter=1000)
model.fit(X_train,y_train)

In [None]:
# Score the fitted model on both splits with the same four metrics, always in
# the order accuracy, precision, recall, F1.
metric_functions = (accuracy_score, precision_score, recall_score, f1_score)

X_train_prediction = model.predict(X_train)
(training_data_accuracy,
 training_data_precision,
 training_data_recall,
 training_data_f1) = [metric(y_train, X_train_prediction) for metric in metric_functions]

X_test_prediction = model.predict(X_test)
(testing_data_accuracy,
 testing_data_precision,
 testing_data_recall,
 testing_data_f1) = [metric(y_test, X_test_prediction) for metric in metric_functions]

In [None]:
# Print test vs. train for each metric; every pair after the first is preceded
# by a blank line.
metric_report = [
    ("Accuracy", testing_data_accuracy, training_data_accuracy),
    ("Precision", testing_data_precision, training_data_precision),
    ("Recall", testing_data_recall, training_data_recall),
    ("f1", testing_data_f1, training_data_f1),
]
for position, (label, test_value, train_value) in enumerate(metric_report):
    lead = "" if position == 0 else "\n"
    print(f"{lead}Testing Data {label}: {test_value} \nTraining Data {label}: {train_value}")

Testing Data Accuracy: 0.7986875 
Training Data Accuracy: 0.81021953125

Testing Data Precision: 0.7931319894055326 
Training Data Precision: 0.7991702420403667

Testing Data Recall: 0.8083065695254995 
Training Data Recall: 0.8286528178411708

Testing Data f1: 0.8006473853923614 
Training Data f1: 0.8136445407644656


In [None]:
# Spot-check a single held-out row: show its true label, then the prediction.
sample_index = 150
X_data = X_test[sample_index]
print(y_test[sample_index])
prediction = model.predict(X_data)
print(prediction)

1
[1]


## **Testing the model on custom data**

In [None]:
# Repeats the stop-word import/download from the Stemming section so this
# section can be run on its own.
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def preprocess_text(text):
    """Clean a raw tweet the way the training text was cleaned.

    The classifier in this notebook is fitted on df['text'] after links and
    symbols have been stripped, with stop words left in place (the
    stemming / stop-word step is commented out). Input at prediction time
    therefore gets the same treatment. Removing stop words here would hand
    the model text unlike what it was trained on, and would delete negations
    such as "not" and "no", which are on NLTK's English stop-word list.

    Args:
        text: the raw tweet (a str).

    Returns:
        The lower-cased text with links, punctuation and underscores removed
        and runs of whitespace collapsed to single spaces.
    """
    import re  # local import: the notebook has no single import cell to extend

    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    # Anchored so that ordinary words containing "www" (e.g. "awww") are kept.
    text = re.sub(r'\bwww\.\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # The underscore is a word character for \w, so it is removed separately.
    text = text.replace('_', '')
    return ' '.join(text.split())

def predict_sentiment(input_text):
    """Classify one tweet and return a human-readable label.

    The text is run through preprocess_text, vectorised with the fitted
    vectorizer and passed to the trained model.

    Args:
        input_text: the raw tweet (a str).

    Returns:
        'Negative Tweet' when the predicted class is 0, otherwise
        'Positive Tweet'.
    """
    cleaned = preprocess_text(input_text)
    features = vectorizer.transform([cleaned])
    predicted_label = model.predict(features)[0]
    return 'Negative Tweet' if predicted_label == 0 else 'Positive Tweet'

In [None]:
# Read one tweet from the keyboard and print the model's verdict.
# NOTE(review): input() waits for the user, so an unattended "Run All" stops here.
user_input = input("Enter a tweet: ")
result = predict_sentiment(user_input)
print("Prediction:", result)

Enter a tweet: you are a black piece of shit
Prediction: Negative Tweet
