In [1]:
import pandas as pd

## Load Dataset

In [2]:
train = pd.read_csv("data/train.tsv", sep="\t", index_col="PhraseId")

print(train.shape)
train.head()

(156060, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


In [3]:
test = pd.read_csv("data/test.tsv", sep="\t", index_col="PhraseId")

print(test.shape)
test.head()

(66292, 2)


Unnamed: 0_level_0,SentenceId,Phrase
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,8545,An intermittently pleasing but mostly routine ...
156062,8545,An intermittently pleasing but mostly routine ...
156063,8545,An
156064,8545,intermittently pleasing but mostly routine effort
156065,8545,intermittently pleasing but mostly routine


## Preprocessing

In [4]:
train["Phrase(Origin)"] = train["Phrase"].copy()

print(train.shape)
train[["Phrase", "Phrase(Origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [5]:
test["Phrase(Origin)"] = test["Phrase"].copy()

print(test.shape)
test[["Phrase", "Phrase(Origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Clean Text

In [6]:
def clean_text(phrase):
    phrase = phrase.replace("ca n't", "can not")
    phrase = phrase.replace("n't", "not")
    
    return phrase

train["Phrase"] = train["Phrase"].apply(clean_text)

print(train.shape)
train[["Phrase", "Phrase(Origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [7]:
test["Phrase"] = test["Phrase"].apply(clean_text)

print(test.shape)
test[["Phrase", "Phrase(Origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Stem Phrases

In [43]:
from tqdm import tqdm
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')

def stem_phrase(phrase):
    stemmed_words = [stemmer.stem(w) for w in phrase.split(" ")]
    stemmed_pharse = " ".join(stemmed_words)
    
    return stemmed_pharse

In [44]:
tqdm.pandas(desc="stemming..")
train['Phrase'] = train['Phrase'].progress_apply(stem_phrase)

print(train.shape)
train[["Phrase", "Phrase(Origin)"]].head()

stemming..: 100%|██████████| 156060/156060 [00:21<00:00, 7152.24it/s]


(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,a seri of escapad demonstr the adag that what ...,A series of escapades demonstrating the adage ...
2,a seri of escapad demonstr the adag that what ...,A series of escapades demonstrating the adage ...
3,a seri,A series
4,a,A
5,seri,series


In [45]:
tqdm.pandas(desc="stemming..")
test['Phrase'] = test['Phrase'].progress_apply(stem_phrase)

print(test.shape)
test[["Phrase", "Phrase(Origin)"]].head()

stemming..: 100%|██████████| 66292/66292 [00:11<00:00, 5752.88it/s]

(66292, 3)





Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,an intermitt pleas but most routin effort .,An intermittently pleasing but mostly routine ...
156062,an intermitt pleas but most routin effort,An intermittently pleasing but mostly routine ...
156063,an,An
156064,intermitt pleas but most routin effort,intermittently pleasing but mostly routine effort
156065,intermitt pleas but most routin,intermittently pleasing but mostly routine


### Vectorize Phrases

In [98]:
from sklearn.feature_extraction.text import TfidfVectorizer

char_vectorizer = TfidfVectorizer(analyzer='char',
                                  max_features=10000,
                                  ngram_range=(1, 9))
char_vectorizer

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 9), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [99]:
char_vectorizer.fit(train["Phrase"])

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 9), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [100]:
X_train_char = char_vectorizer.transform(train["Phrase"])
X_train_char

<156060x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 18146974 stored elements in Compressed Sparse Row format>

In [101]:
vocabulary = char_vectorizer.get_feature_names()
pd.DataFrame(X_train_char[0:1000].toarray(), columns=vocabulary).head()

Unnamed: 0,Unnamed: 1,','.1,'',''.1,'s,'s.1,'s a,'s a.1,'s b,...,ywoo,ywood,ywood.1,z,z.1,za,ze,zi,zi.1,zz
0,0.25636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.184616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.067848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [102]:
X_test_char = char_vectorizer.transform(test["Phrase"])
X_test_char

<66292x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 7121716 stored elements in Compressed Sparse Row format>

In [89]:
from sklearn.feature_extraction.text import TfidfVectorizer

word_vectorizer = TfidfVectorizer(max_features=30000,
                                  ngram_range=(1, 2))
word_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [90]:
word_vectorizer.fit(train["Phrase"])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [92]:
X_train_word = word_vectorizer.transform(train["Phrase"])
X_train_word

<156060x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 1508198 stored elements in Compressed Sparse Row format>

In [93]:
vocabulary = word_vectorizer.get_feature_names()
pd.DataFrame(X_train_word[0:1000].toarray(), columns=vocabulary).head()

Unnamed: 0,000,000 time,10,10 000,10 minut,10 or,10 second,10 set,10 year,100,...,zish,zish and,zombi,zombi you,zone,zone and,zone arm,zone episod,zucker,zucker brothers
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
X_test_word = word_vectorizer.transform(test["Phrase"])
X_test_word

<66292x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 499286 stored elements in Compressed Sparse Row format>

In [105]:
from scipy.sparse import hstack

X_train = hstack([X_train_char, X_train_word])
X_train

<156060x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 19655172 stored elements in COOrdinate format>

In [106]:
X_test = hstack([X_test_char, X_test_word])
X_test

<66292x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 7621002 stored elements in COOrdinate format>

In [107]:
y_train = train["Sentiment"]

print(y_train.shape)
y_train.head()

(156060,)


PhraseId
1    1
2    2
3    2
4    2
5    2
Name: Sentiment, dtype: int64

## Train

In [108]:
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(alpha=6.762746e-06,
                      random_state=37)
model

SGDClassifier(alpha=6.762746e-06, average=False, class_weight=None,
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=37, shuffle=True,
       tol=None, verbose=0, warm_start=False)

### Score

In [109]:
from sklearn.model_selection import cross_val_predict

y_predict = cross_val_predict(model, X_train, y_train, cv=5)

print(y_predict.shape)
y_predict



(156060,)


array([3, 3, 2, ..., 2, 2, 2])

In [110]:
from sklearn.metrics import accuracy_score

score = accuracy_score(y_train, y_predict)

print("Score = {0:.6f}".format(score))

Score = 0.582276


In [111]:
import numpy as np

result = train.copy()
result["Sentiment(Predict)"] = y_predict

result["Distance"] = np.abs(result["Sentiment"] - result["Sentiment(Predict)"])

result = result.sort_values(by="Distance", ascending=False)

print(result.shape)
result.head()

(156060, 6)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(Origin),Sentiment(Predict),Distance
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
56607,2843,'s never a dull moment in the giant spider inv...,4,'s never a dull moment in the giant spider inv...,0,4
80444,4144,you wonder what anyon saw in this film that al...,0,you wonder what anyone saw in this film that a...,4,4
25874,1185,never lose it abil to shock and amaz .,4,never loses its ability to shock and amaze .,0,4
138674,7515,was so uninspir that even a stori immers in lo...,0,was so uninspiring that even a story immersed ...,4,4
7024,281,a scummi ripoff of david cronenberg 's brillia...,0,a scummy ripoff of David Cronenberg 's brillia...,4,4


In [112]:
result.to_csv("result.csv")

In [113]:
vocabulary = vectorizer.get_feature_names()
pd.DataFrame(vocabulary).to_csv("vocabulary.csv")

In [114]:
model.fit(X_train, y_train)



SGDClassifier(alpha=6.762746e-06, average=False, class_weight=None,
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=37, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [115]:
predictions = model.predict(X_test)

print(predictions.shape)
predictions

(66292,)


array([3, 3, 2, ..., 2, 2, 1])

### Submit

In [116]:
submit = pd.read_csv("data/sampleSubmission.csv", index_col="PhraseId")

print(submit.shape)
submit.head()

(66292, 1)


Unnamed: 0_level_0,Sentiment
PhraseId,Unnamed: 1_level_1
156061,2
156062,2
156063,2
156064,2
156065,2


In [117]:
submit["Sentiment"] = predictions

print(submit.shape)
submit.head()

(66292, 1)


Unnamed: 0_level_0,Sentiment
PhraseId,Unnamed: 1_level_1
156061,3
156062,3
156063,2
156064,3
156065,3


In [118]:
submit.to_csv("data/baseline-script.csv")

In [41]:
# from nltk import pos_tag
# from nltk.tokenize import word_tokenize
# from nltk.stem.snowball import SnowballStemmer
# from nltk.stem import WordNetLemmatizer

# def find_pos(pos):
#     if "V" in pos:
#         return "v"
#     else:
#         return "n"

# stemmer = SnowballStemmer('english')
# lemmatizer = WordNetLemmatizer()

# phrase = train.loc[1]["Phrase"]

# words = word_tokenize(phrase)

# words_pos = pos_tag(words)

# cleaned_words = [lemmatizer.lemmatize(w, find_pos(p)) for w, p in words_pos]
# cleaned_words = [stemmer.stem(w) for w in cleaned_words]

# " ".join(cleaned_words)