<a href="https://colab.research.google.com/github/akash-joshi/twitter-sentiment/blob/master/twitter_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download the training CSV into the working directory; wget's -c flag resumes a partially downloaded file instead of starting over.
# NOTE(review): the logged transfer below is only ~2.9 MB, so this is presumably a sample rather than the full 1.6M-tweet file its name suggests - confirm.
!wget -c https://raw.githubusercontent.com/crwong/cs224u-project/master/data/sentiment/training.1600000.processed.noemoticon.csv

--2019-01-29 05:15:49--  https://raw.githubusercontent.com/crwong/cs224u-project/master/data/sentiment/training.1600000.processed.noemoticon.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2989873 (2.9M) [text/plain]
Saving to: ‘training.1600000.processed.noemoticon.csv’


2019-01-29 05:15:54 (42.4 MB/s) - ‘training.1600000.processed.noemoticon.csv’ saved [2989873/2989873]



In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python

import numpy as np
import pandas as pd
import re


def import_tweets(filename, header = None):
	"""Read a sentiment140-style CSV and keep only the label and the tweet text.

	arguments: filename = path of the csv file (read as latin-1),
	           header = forwarded to pandas.read_csv (None means the file has no header row)
	returns: a DataFrame with the columns 'sentiment' and 'text', where every
	         sentiment value of 4 has been replaced by 1
	"""
	#column layout of the sentiment140 dataset, in file order
	all_columns = ['sentiment','id','date','flag','user','text']
	#the four columns that are not needed for the analysis
	unused_columns = ['flag','id','user','date']

	frame = pd.read_csv(filename, header = header, encoding = 'latin-1')
	frame.columns = all_columns
	frame = frame.drop(columns = unused_columns)
	#sentiment140 encodes positive as 4 and negative as 0; recode positive to 1
	frame['sentiment'] = frame['sentiment'].replace(4, 1)
	return frame
	
def preprocess_tweet(tweet):
	"""Preprocess the text in a single tweet.

	The steps run in this order: lower-case the text, replace urls with "URL",
	replace @mentions with "AT_USER", squeeze every run of whitespace into one
	space, and drop the leading '#' of hashtags. Leading/trailing whitespace is
	squeezed to a single space, not removed.

	arguments: tweet = a single tweet in form of string
	returns: the cleaned tweet as a new string
	"""
	#convert the tweet to lower case; str.lower() returns a new string, so the result has to be assigned back
	tweet = tweet.lower()
	#convert all urls to string "URL" (done after lower-casing, so the placeholder itself stays upper case)
	tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
	#convert all @username to "AT_USER"
	tweet = re.sub(r'@[^\s]+','AT_USER', tweet)
	#correct all multiple white spaces to a single white space
	tweet = re.sub(r'[\s]+', ' ', tweet)
	#convert "#topic" to just "topic"
	tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
	return tweet

def feature_extraction(data, method = "tfidf"):
	"""Turn a collection of tweets into a feature matrix.

	arguments: data = all the tweets in the form of array,
	           method = type of feature extracter; only "tfidf" is implemented
	returns: the result of TfidfVectorizer.fit_transform(data) for "tfidf";
	         for any other method (including "doc2vec") the string
	         "Incorrect inputs" is returned instead of a matrix
	"""
	#reject unsupported extractors up front
	if method != "tfidf":
		return "Incorrect inputs"

	#imported lazily so the dependency is only needed when features are actually built
	from sklearn.feature_extraction.text import TfidfVectorizer
	#we need to give proper stopwords list for better performance
	vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words = "english")
	return vectorizer.fit_transform(data)

#load the training csv, then run preprocess_tweet over the text of every row
tweet_dataset = import_tweets("training.1600000.processed.noemoticon.csv")
tweet_dataset['text'] = tweet_dataset['text'].map(preprocess_tweet)
#numpy copies of the cleaned text and of the sentiment labels, in the same row order
data = np.array(tweet_dataset['text'])
label = np.array(tweet_dataset['sentiment'])


from sklearn.model_selection import train_test_split

#hold out 20% of the tweets; random_state pins the shuffle so the split is the same on every run
#NOTE(review): the model below is fitted and scored on `features`/`label`, so this split is not used in the visible cells - confirm
X_train , X_test , ytrain , ytest = train_test_split(data,label,test_size=0.2,random_state=42)

#tf-idf features are built from the full `data` array, not only from X_train
features = feature_extraction(data, method = "tfidf") #1600000x288571 sparse matrix of type 'numpy.float64'


from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
#multinomial naive Bayes classifier, fitted on the tf-idf features of every tweet
model = MultinomialNB()
model.fit(features, label)

#predict on the same data the model was fitted on, so the printed value is training accuracy
predictions = model.predict(features)
train_accuracy = accuracy_score(label, predictions)
print(train_accuracy)

0.878
