# Sentiment Analysis on Twitter using Random Forest Classifier

## Import Dataset

In [None]:
import re
import nltk
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [1]:
%matplotlib inline

In [None]:
# Mount Google Drive at /content/drive so the dataset CSV stored there can be
# read by later cells. `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Relative path to the tweets CSV; it is resolved against the current working
# directory, so the Drive mount must be reachable as ./drive from there.
DATASET_FILEPATH = "./drive/My Drive/Datasets/Twitter Sentiment Analysis Dataset/tweets.csv"

# Load the CSV (decoded as ISO-8859-1) and shuffle its rows in one step.
# The shuffle is unseeded, so row order differs between runs.
# NOTE(review): read_csv treats the first row as a header by default. If the
# file has no header row, that first tweet is silently consumed as column
# names -- confirm, and pass header=None if so.
raw_data = shuffle(pd.read_csv(DATASET_FILEPATH, encoding="ISO-8859-1"))

# Preview the first rows of the shuffled frame.
raw_data.head()

## Extracting Data

### Extracting Tweets

In [None]:
# Select the column at position 5 of the raw frame, then convert it into a
# plain Python list for the preprocessing cells.
# NOTE(review): presumably this column holds the tweet text -- confirm
# against the CSV layout.
tweets = raw_data.iloc[:, 5]
tweets = tweets.tolist()

### Extracting Labels

In [None]:
# Column 0 is used as the label column; plot how often each raw label value
# occurs before the labels are collapsed.
labels = raw_data.iloc[:, 0]
labels.value_counts().plot(kind='bar')

# Collapse to two classes: a label equal to 0 is kept as is, every other
# value becomes 1.
labels = [1 if value != 0 else value for value in labels.tolist()]

## Preprocessing

### Convert to Lowercase

In [None]:
# Lower-case every tweet so later token comparisons are case-insensitive.
tweets = list(map(str.lower, tweets))

### Remove Punctuation

In [None]:
# Replace every character that is not an ASCII letter or an apostrophe with a
# space. Apostrophes are kept, so contractions such as "don't" stay intact.
# This also removes digits, not just punctuation.
tweets = list(map(lambda text: re.sub(r"[^A-Za-z']", " ", text), tweets))

### Remove Numbers

In [None]:
# Replace each ASCII digit with a space.
# NOTE: when the punctuation-removal cell has already run, no digits remain
# and this pass leaves the tweets unchanged.
tweets = [
    re.sub('[0-9]', " ", text)
    for text in tweets
]

### Remove Stopwords

In [None]:
# Fetch the NLTK 'stopwords' corpus so it can be loaded by the next cell.
# (`nltk` is already imported at the top of the notebook; the re-import is
# harmless.)
import nltk
nltk.download('stopwords')

In [None]:
# Import the corpus reader under an alias so that binding the word list to
# `stopwords` does not overwrite the reader itself. Without the alias,
# re-running this cell fails with "'list' object has no attribute 'words'".
from nltk.corpus import stopwords as stopwords_corpus

# English stop-word list used by the filtering cell below.
stopwords = stopwords_corpus.words('english')

In [None]:
# Build a set once so each membership test is O(1); testing against the list
# scans it for every word of every tweet.
_stopword_set = set(stopwords)

# Drop stop words and re-join the remaining tokens with single spaces.
tweets = [
    " ".join(word for word in tweet.split() if word not in _stopword_set)
    for tweet in tweets
]

### Lemmatization

In [None]:
# Import the WordNet-based lemmatizer and fetch the 'wordnet' corpus with
# nltk.download so it is available to the lemmatization cell.
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

In [None]:
# Lemmatize each whitespace-separated token and re-join the tokens with single
# spaces. No part-of-speech is passed, so the lemmatizer's default is used.
lemmatizer = WordNetLemmatizer()
tweets = [
    " ".join(map(lemmatizer.lemmatize, tweet.split()))
    for tweet in tweets
]

## Truncate Data

In [None]:
# Keep only the first 100000 samples. Tweets and labels are sliced in step so
# they stay aligned.
tweets, labels = tweets[:100000], labels[:100000]

## TF-IDF Encoding

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer limited to 500 features; terms appearing in fewer than 5
# tweets or in more than 70% of tweets are ignored.
tfidf_transformer = TfidfVectorizer(
    max_features=500,
    min_df=5,
    max_df=0.7,
)

# Learn the vocabulary and encode the tweets, then convert the sparse result
# into a dense array.
X = tfidf_transformer.fit_transform(tweets)
X = X.toarray()

In [None]:
# Report the encoded feature matrix and the number of labels. `tweets` is a
# plain list with no `.shape`, so the shape is taken from the TF-IDF matrix X.
print("Training Dataset Shape: ", X.shape)
print("Training Label Shape: ", len(labels))

## Split Dataset

In [None]:
# Hold out 20% of the samples for testing. The split is done on the TF-IDF
# matrix X, not on the raw tweet strings, so X_train/X_test are numeric arrays
# that have a `.shape` and can be fed to the classifier.
X_train, X_test, Y_train, Y_test = train_test_split(X, labels, test_size=0.2)

In [None]:
# Show the shapes of the train and test feature matrices.
for caption, subset in (("X_train Shape: ", X_train), ("X_test Shape: ", X_test)):
    print(caption, subset.shape)

In [None]:
# Show how many labels ended up in the train and test partitions.
for caption, subset in (("Y_train Shape: ", Y_train), ("Y_test Shape: ", Y_test)):
    print(caption, len(subset))

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 1000 trees with a fixed seed for repeatable fits.
# n_jobs=-1 trains the trees on all available cores; single-core training of
# 1000 trees is needlessly slow, and the seed keeps the result deterministic.
classifier = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)

# Fit on the training split; as the cell's last expression, the fitted
# estimator is also displayed.
classifier.fit(X_train, Y_train)

## Evaluate Model

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Predict labels for the held-out test features; the metric cells below
# compare these predictions against Y_test.
Y_pred = classifier.predict(X_test)

### Confusion Matrix

In [None]:
# Confusion matrix of true test labels (first argument) against predictions.
matrix = confusion_matrix(Y_test, Y_pred)
print(matrix)

### Classification Report

In [None]:
# Text report of per-class metrics for the test predictions.
report = classification_report(Y_test, Y_pred)
print(report)

### Accuracy Score

In [None]:
# Accuracy score of the predictions against the true test labels.
accuracy = accuracy_score(Y_test, Y_pred)
print(accuracy)