In [1]:
import pandas as pd
tweets = pd.read_csv("/media/alessandro/storage/Tesi/locale/twitter-sentiment-analysis2/train.csv", encoding='latin-1')
list(tweets.columns.values)

['ItemID', 'Sentiment', 'SentimentText']

In [2]:
tweets.Sentiment.replace(0,'negative', inplace=True)
tweets.Sentiment.replace(1,'positive', inplace=True)

In [4]:
tweets.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,negative,is so sad for my APL frie...
1,2,negative,I missed the New Moon trail...
2,3,positive,omg its already 7:30 :O
3,4,negative,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,negative,i think mi bf is cheating on me!!! ...


In [5]:
sentiment_counts = tweets.Sentiment.value_counts()
number_of_tweets = tweets.ItemID.count()
print(sentiment_counts)

positive    56457
negative    43532
Name: Sentiment, dtype: int64


In [6]:
import re, nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()

def normalizer(tweet):
    only_letters = re.sub("[^a-zA-Z]", " ",tweet) 
    tokens = nltk.word_tokenize(only_letters)[2:]
    lower_case = [l.lower() for l in tokens]
    filtered_result = list(filter(lambda l: l not in stop_words, lower_case))
    lemmas = [wordnet_lemmatizer.lemmatize(t) for t in filtered_result]
    return lemmas

In [8]:
normalizer("Here is text about an airline I like.")

['text', 'airline', 'like']

In [9]:
pd.set_option('display.max_colwidth', -1) # Setting this so we can see the full content of cells
tweets['normalized_tweet'] = tweets.SentimentText.apply(normalizer)
tweets[['SentimentText','normalized_tweet']].head()

Unnamed: 0,SentimentText,normalized_tweet
0,is so sad for my APL friend.............,"[sad, apl, friend]"
1,I missed the New Moon trailer...,"[new, moon, trailer]"
2,omg its already 7:30 :O,[already]
3,.. Omgaga. Im sooo im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)...,"[sooo, im, gunna, cry, dentist, since, suposed, get, crown, put, min]"
4,i think mi bf is cheating on me!!! T_T,"[mi, bf, cheating]"


In [10]:
from nltk import ngrams
def ngrams(input_list):
    #onegrams = input_list
    bigrams = [' '.join(t) for t in list(zip(input_list, input_list[1:]))]
    trigrams = [' '.join(t) for t in list(zip(input_list, input_list[1:], input_list[2:]))]
    return bigrams+trigrams
tweets['grams'] = tweets.normalized_tweet.apply(ngrams)
tweets[['grams']].head()

Unnamed: 0,grams
0,"[sad apl, apl friend, sad apl friend]"
1,"[new moon, moon trailer, new moon trailer]"
2,[]
3,"[sooo im, im gunna, gunna cry, cry dentist, dentist since, since suposed, suposed get, get crown, crown put, put min, sooo im gunna, im gunna cry, gunna cry dentist, cry dentist since, dentist since suposed, since suposed get, suposed get crown, get crown put, crown put min]"
4,"[mi bf, bf cheating, mi bf cheating]"


In [11]:
import collections
def count_words(input):
    cnt = collections.Counter()
    for row in input:
        for word in row:
            cnt[word] += 1
    return cnt

In [12]:
tweets[(tweets.Sentiment == 'negative')][['grams']].apply(count_words)['grams'].most_common(20)

[('gon na', 587),
 ('wan na', 531),
 ('wish could', 308),
 ('last night', 300),
 ('twitpic com', 262),
 ('got ta', 223),
 ('sorry hear', 194),
 ('feel better', 187),
 ('bit ly', 186),
 ('look like', 184),
 ('feel like', 182),
 ('http bit', 168),
 ('http bit ly', 166),
 ('na go', 150),
 ('oh well', 132),
 ('want go', 126),
 ('next week', 113),
 ('miss u', 110),
 ('get better', 109),
 ('make sad', 105)]

In [13]:
tweets[(tweets.Sentiment == 'positive')][['grams']].apply(count_words)['grams'].most_common(20)

[('twitpic com', 658),
 ('gon na', 527),
 ('bit ly', 444),
 ('http bit', 395),
 ('http bit ly', 395),
 ('wan na', 307),
 ('good luck', 236),
 ('last night', 221),
 ('tinyurl com', 212),
 ('got ta', 210),
 ('let know', 208),
 ('quot quot', 207),
 ('http tinyurl', 202),
 ('http tinyurl com', 200),
 ('com add', 184),
 ('day using', 182),
 ('using www', 182),
 ('add everyone', 182),
 ('everyone train', 182),
 ('train pay', 182)]

In [14]:
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(ngram_range=(1,2))

In [15]:
vectorized_data = count_vectorizer.fit_transform(tweets.SentimentText)
indexed_data = hstack((np.array(range(0,vectorized_data.shape[0]))[:,None], vectorized_data))

In [16]:
def sentiment2target(sentiment):
    return {
        'negative': 0,
        'neutral': 1,
        'positive' : 2
    }[sentiment]
targets = tweets.Sentiment.apply(sentiment2target)

In [17]:
from sklearn.model_selection import train_test_split
data_train, data_test, targets_train, targets_test = train_test_split(indexed_data, targets, test_size=0.3, random_state=0)
data_train_index = data_train[:,0]
data_train = data_train[:,1:]
data_test_index = data_test[:,0]
data_test = data_test[:,1:]

In [None]:
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
clf = OneVsRestClassifier(svm.SVC(gamma='scale', C=50, probability=True, kernel='linear', verbose=True))
clf_output = clf.fit(data_train, targets_train)

[LibSVM]