## Importing Libraries

In [11]:
#Data analysis
import pandas as pd
import numpy as np

## Importing Data from csv files

In [12]:
#Independent Variable
train_tweet_data = pd.read_csv('train.csv').iloc[:,1]

#Dependent Variable
train_sentiments = pd.read_csv('train.csv').iloc[:,5]

#Data in which we have to predict values
test_tweet_data = pd.read_csv('test.csv').iloc[:,0:2]

## Funciton for Cleaning Data
    
    The given datasets are comprised of very much unstructured tweets which should be preprocessed to make an NLP model. In this project, we tried out the following techniques of preprocessing the raw data. But the preprocessing techniques is not limited.
        --Removal of punctuations.
        --Removal of commonly used words (stopwords).
        --Normalization of words(Stemming).

In [13]:
#Importing Library for Data Cleaning 
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

#function for Data Cleaning
def cleaned_tweets(tweet_data):
    
    corpus = []
    for tweets in tweet_data:
        
        cleaned_tweets = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweets)
        cleaned_tweets = cleaned_tweets.lower().split()
        ps = PorterStemmer()
        
        all_stopwords = stopwords.words('english')
        all_stopwords.remove('not') 
        tweets_words = [ps.stem(word) for word in cleaned_tweets if not word in set(all_stopwords)]
        
        #joining all cleaned words and making list of it
        cleaned_tweets = ' '.join(tweets_words)             
        corpus.append(cleaned_tweets)
        
    return corpus

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Bag-of-Words Model

    Bag-of-words model is a way of extracting features from text for use in modeling.

In [14]:
#Importing library for Bag of words Model 
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

#making X and y matrix: X contains corpus (dependent variable) 
#y contains sentiments 1 (positive), -1 (Negative) , 0 (Neutral)
X = cv.fit_transform(cleaned_tweets(train_tweet_data)).toarray()
y = train_sentiments.values

## Dimensionality Reduction
    
    Input data is high dimensional data, so reducing the dimensionality by projecting the data to a lower dimensional       subspace which captures the “essence” of the data. 

In [15]:
#Dimensionality Reduction
from sklearn.decomposition import KernelPCA
kpca = KernelPCA( kernel = 'rbf')
X = kpca.fit_transform(X)

## Fitting Random Forest Model

In [16]:
#Model Selection
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators =10, criterion = 'entropy',random_state=1)
classifier.fit(X, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

## Cleaning Test Data and Preparing Test Data for Predicting sentiments from the tweets

In [17]:
#Cleaning Test Data
test_X = cleaned_tweets(test_tweet_data.iloc[:,1])

#extracting features from test data using Bag of Word Model and converting into numpy array
test_X = cv.transform(test_X).toarray()

#Applying Dimensionality reduction to Test Data
test_X = kpca.transform(test_X)

#Predicting sentiments 1 (positive), -1 (Negative) , 0 (Neutral) for test data represented by y_new
y_new = classifier.predict(test_X)

## Converting data into submission format 

In [18]:
#Adding ID and sentiments columnwise and making dataframe of it
predicted_sentiments_test_data = np.column_stack((test_tweet_data.iloc[:,0],y_new))
predicted_sentiments_test_data= pd.DataFrame(predicted_sentiments_test_data,columns=["id",'sentiment_class'])
print(predicted_sentiments_test_data)

                id  sentiment_class
0     1.246628e+18              0.0
1     1.245898e+18             -1.0
2     1.244717e+18             -1.0
3     1.245730e+18              1.0
4     1.244636e+18              0.0
...            ...              ...
1382  1.245219e+18              0.0
1383  1.245882e+18             -1.0
1384  1.246461e+18              0.0
1385  1.246245e+18              0.0
1386  1.245178e+18              0.0

[1387 rows x 2 columns]


## Saving predicted sentiments to csv file

In [19]:
predicted_sentiments_test_data.to_csv('Twitter_sentiments_Analysis.csv',index=False,columns=["id",'sentiment_class'])