# Twitter sentiment analysis

In [None]:
import pandas as pd

# Column names applied to the CSV through `names=` below.
COLUMNS  = ['target', 'id', 'date', 'flag', 'user', 'text']

# Load the tweets, decoding the file as ISO-8859-2 and labelling the columns
# with COLUMNS (presumably the file has no header row of its own — verify).
# NOTE(review): ISO-8859-2 is the Central European (Latin-2) code page — confirm
# it is really the file's encoding; a wrong single-byte codec can decode without
# raising while still producing the wrong accented characters.
data = pd.read_csv('twitter_data.csv', encoding='iso8859_2', names=COLUMNS)

In [None]:
# Peek at the first rows to check that the columns were parsed as expected
data.head()

### Types and shape

In [None]:
# Per-column dtypes of the loaded frame
data.dtypes

In [None]:
# (rows, columns) of the loaded frame
data.shape

### Target value

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Class distribution of the label column, bars ordered from most to least frequent.
# The Series is passed as `x=` by keyword: in seaborn >= 0.12 the only positional
# parameter is `data`, so a positional Series is no longer read as the x variable.
# The keyword form also works on older seaborn releases.
sns.countplot(x=data['target'], order=data['target'].value_counts().index)
plt.show()

The target variable is balanced: its classes occur in roughly equal numbers.

In [None]:
# Distribution of the `flag` column, bars ordered from most to least frequent.
# The Series is passed as `x=` by keyword: in seaborn >= 0.12 the only positional
# parameter is `data`, so a positional Series is no longer read as the x variable.
sns.countplot(x=data['flag'], order=data['flag'].value_counts().index)
plt.show()

In [None]:
# Pull the raw tweet bodies and their labels out of the frame as plain lists.
text = list(data['text'])
target = list(data['target'])

In [None]:
import re

# Patterns used to normalise raw tweets before tokenisation.

# Mentions such as "@someone": "@" followed by a run of non-whitespace.
pattern_nick = re.compile(r'@[^\s]+')
# Links with an explicit scheme. `s?` makes https:// links match as well as
# http:// ones (otherwise they survive and get mangled once punctuation is
# stripped), and `\S` stops the match at any whitespace, not only at a space,
# consistent with pattern_nick.
pattern_http = re.compile(r'https?://\S+')
# Scheme-less links that start with "www.".
pattern_www = re.compile(r'www\.\S+')
# Runs of ASCII digits.
pattern_number = re.compile(r'[0-9]+')
# Individual punctuation characters: % , . ! ? -
pattern_signs = re.compile(r'[%,.!?-]')
# Any single character that is not an ASCII letter.
pattern_letters = re.compile(r'[^a-zA-Z]')

In [None]:
from nltk.stem import WordNetLemmatizer 
import nltk

# Shared lemmatizer instance used by the tweet-cleaning loop.
# NOTE(review): no nltk.download(...) call is visible in this notebook; the
# lemmatizer presumably needs the WordNet corpus installed beforehand — confirm.
lemmatizer = WordNetLemmatizer() 

In [None]:
from nltk.stem import PorterStemmer 

# Shared Porter stemmer instance; applied to each lemma in the cleaning loop.
ps = PorterStemmer()

In [None]:
from nltk.corpus import stopwords

# Show the English stop-word list that the cleaning loop filters against.
print(stopwords.words('english'))

In [None]:
# Normalise every raw tweet into a string of stemmed tokens, each followed by a
# single space.

# Build the stop-word set once. The lookup list never changes, so rebuilding
# the set for every word of every tweet only repeats the same work.
stop_words = set(stopwords.words('english'))

tweets = []

for tweet in text:
    tweet = tweet.lower()
    # Collapse mentions and links to placeholder tokens so that individual
    # handles and URLs do not each become a feature of their own. The link
    # patterns run before punctuation is stripped, while the dots are intact.
    tweet = pattern_nick.sub('nick', tweet)
    tweet = pattern_http.sub('www', tweet)
    tweet = pattern_www.sub('www', tweet)
    # Drop digits and the punctuation characters listed in pattern_signs.
    tweet = pattern_number.sub('', tweet)
    tweet = pattern_signs.sub('', tweet)

    # Split on whitespace, drop stop words, then lemmatise and stem what is left.
    stems = [
        ps.stem(lemmatizer.lemmatize(word))
        for word in tweet.split()
        if word not in stop_words
    ]
    # Every kept token is followed by one space (the last one too), so the
    # resulting strings are the same as those produced by string concatenation.
    tweets.append(''.join(stem + ' ' for stem in stems))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned tweets for evaluation; random_state is pinned so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(tweets, target, test_size = 0.2, random_state = 0)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectoriser on the training tweets only, so nothing from the
# test split influences the learned features.
vectoriser = TfidfVectorizer()
vectoriser.fit(X_train)

In [None]:
# Replace the lists of cleaned tweet strings with their TF-IDF representation.
# NOTE(review): X_train / X_test are rebound to the transformed output, so this
# cell is not idempotent — re-running it on its own would pass the already
# vectorised data back into transform(). Re-run the split cell first.
X_train = vectoriser.transform(X_train)
X_test  = vectoriser.transform(X_test)

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Train a Bernoulli naive Bayes classifier on the TF-IDF features, with the
# smoothing parameter alpha set to 2.
# NOTE(review): a Bernoulli model works on binary features, so the TF-IDF
# weights are presumably reduced to present/absent by the estimator's
# `binarize` setting — confirm this is intended.
BNBmodel = BernoulliNB(alpha = 2)
BNBmodel.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out tweets
y_pred = BNBmodel.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Text report comparing the held-out labels with the predictions
print(classification_report(y_test, y_pred))

In [None]:
# Quick smoke test of the fitted model on two hand-written inputs.
# NOTE(review): these strings go straight to the vectoriser and skip the
# mention/link/digit/punctuation clean-up, stop-word removal, lemmatising and
# stemming applied to the training tweets — confirm that is intended.
tekst = ['I hate monday', 'good']

# `tekst` is rebound from the list of strings to its vectorised form.
tekst  = vectoriser.transform(tekst)
BNBmodel.predict(tekst)