In [1]:
import pandas as pd

### Load Dataset

In [2]:
train = pd.read_csv("input/train.tsv", sep="\t", index_col="PhraseId")

print(train.shape)
train.head()

(156060, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


In [3]:
test = pd.read_csv("input/test.tsv", sep="\t", index_col="PhraseId")

print(test.shape)
test.head()

(66292, 2)


Unnamed: 0_level_0,SentenceId,Phrase
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,8545,An intermittently pleasing but mostly routine ...
156062,8545,An intermittently pleasing but mostly routine ...
156063,8545,An
156064,8545,intermittently pleasing but mostly routine effort
156065,8545,intermittently pleasing but mostly routine


## Preprocessing

In [4]:
train["Phrase(Origin)"] = train["Phrase"].copy()

print(train.shape)
train[["Phrase", "Phrase(Origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [5]:
test["Phrase(Origin)"] = test["Phrase"].copy()

print(test.shape)
test[["Phrase", "Phrase(Origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Clean Text

In [6]:
import nltk
from tqdm import tqdm
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

# http://www.nltk.org/api/nltk.stem.html
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

from nltk import pos_tag
from nltk.tokenize import word_tokenize

def stem_phrase(phrase): 
    words = phrase.split(" ")
    stemmed_words = [stemmer.stem(w) for w in words]
    stemmed_phrase = " ".join(stemmed_words)

    return stemmed_phrase


def lemm_phrase(phrase): 
    words = phrase.split(" ")
    stemmed_words = [lemmatizer.lemmatize(w) for w in words]
    stemmed_phrase = " ".join(stemmed_words)

    return stemmed_phrase

def post_tag_phrase(phrase): 
    words = word_tokenize(phrase)
    pos_tag_split_words = pos_tag(words)
    pos_tag_words = ["_".join(w) for w in pos_tag_split_words]
    pos_tag_words = " ".join(pos_tag_words)

    return pos_tag_words

def post_tag_lemm_phrase(phrase): 
    words = word_tokenize(phrase)
    pos_tag_split_words = pos_tag(words)
    pos_tag_words = ["_".join([lemmatizer.lemmatize(o[0]), o[1]]) for o in pos_tag_split_words]
    pos_tag_words = " ".join(pos_tag_words)

    return pos_tag_words

def post_tag_stem_phrase(phrase): 
    words = word_tokenize(phrase)
    pos_tag_split_words = pos_tag(words)
    pos_tag_words = ["_".join([stemmer.stem(o[0]), o[1]]) for o in pos_tag_split_words]
    pos_tag_words = " ".join(pos_tag_words)

    return pos_tag_words

def distributionPoS(tag):
    resultTag = "E"
    if tag.find("V") > -1:
        resultTag = "V" 
    elif tag.find("N") > -1:
        resultTag = "N" 
    else:
        resultTag = tag
    return resultTag

def firstCharLowerPoS(tag):
    resultTag = tag[0]
    resultTag = resultTag.lower()
    
    return resultTag

def postag_lemm_or_stem_phrase(phrase):
    words = []
    posList = ['n', 'v', 'a', 'r']
    for word, tag in pos_tag(word_tokenize(phrase)):
        if firstCharLowerPoS(tag) in posList:
            words.append(lemmatizer.lemmatize(word, pos=firstCharLowerPoS(tag)))
        else:
            words.append(stemmer.stem(word))

    return_phrase = " ".join(words)
   
    return return_phrase

postag_lemm_or_stem_phrase(train.loc[1028]["Phrase"])
postag_lemm_or_stem_phrase('Highly recommended viewing for its courage , ideas , technical proficiency and great acting .')
def encapsulationPoS(tag):
    return tag[0]

def post_tag_lemm_v_or_n_phrase(phrase): 
    words = word_tokenize(phrase)
    pos_tag_split_words = pos_tag(words)
    pos_tag_words = ["_".join([lemmatizer.lemmatize(o[0]), encapsulationPoS(o[1])]) for o in pos_tag_split_words]
    pos_tag_words = " ".join(pos_tag_words)

    return pos_tag_words

def post_tag_lemm_stem_phrase(phrase): 
    words = word_tokenize(phrase)
    pos_tag_split_words = pos_tag(words)
    pos_tag_words = ["_".join([stemmer.stem(lemmatizer.lemmatize(o[0])), o[1]]) for o in pos_tag_split_words]
    pos_tag_words = " ".join(pos_tag_words)

    return pos_tag_words

In [7]:
def clean_text(phrase):
    phrase = phrase.replace("ca n't", "can not")
    phrase = phrase.replace("does n't", "does not")
    phrase = phrase.replace("n't", "not")

    return phrase

def isBe_isDo_isDt_isHv(phrase):
    
    
    phrase = phrase.replace("ca n't", "can not")
    phrase = phrase.replace("does n't", "does not")
    phrase = phrase.replace("n't", "not")

    return phrase

In [8]:
# train["Phrase"] = train["Phrase"].apply(clean_text)
tqdm.pandas(desc="stemming...")
#train["Phrase"] = train["Phrase"].progress_apply(lemm_phrase)
#train["Phrase"] = train["Phrase"].progress_apply(stem_phrase)
train["Phrase"] = train["Phrase"].progress_apply(postag_lemm_or_stem_phrase)

print(train.shape)
train[["Phrase", "Phrase(Origin)"]].head()

stemming...: 100%|██████████| 156060/156060 [01:55<00:00, 1355.60it/s]


(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,series of escapade demonstrate adage that what...,A series of escapades demonstrating the adage ...
2,series of escapade demonstrate adage that what...,A series of escapades demonstrating the adage ...
3,series,A series
4,,A
5,series,series


In [9]:
#test["Phrase"] = test["Phrase"].apply(clean_text)
test["Phrase"] = test["Phrase"].progress_apply(postag_lemm_or_stem_phrase)

print(test.shape)
test[["Phrase", "Phrase(Origin)"]].head()

stemming...: 100%|██████████| 66292/66292 [00:46<00:00, 1437.55it/s]

(66292, 3)





Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,intermittently please mostly routin effort .,An intermittently pleasing but mostly routine ...
156062,intermittently please mostly routin effort,An intermittently pleasing but mostly routine ...
156063,,An
156064,intermittently please mostly routin effort,intermittently pleasing but mostly routine effort
156065,intermittently please mostly routin,intermittently pleasing but mostly routine


### Vectorize Phrases

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=50000, ngram_range=(1, 2))
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=50000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [11]:
vectorizer.fit(train["Phrase"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=50000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [12]:
X_train = vectorizer.transform(train["Phrase"])
X_train

<156060x50000 sparse matrix of type '<class 'numpy.int64'>'
	with 1408909 stored elements in Compressed Sparse Row format>

In [13]:
vocabulary = vectorizer.get_feature_names()
pd.DataFrame(X_train[0:100].toarray(), columns=vocabulary).head()

Unnamed: 0,10,10 cours,10 set,10 year,100,100 minut,100 year,102,102 minut,103,...,zombie you,zone,zone arm,zone episode,zone leave,zone of,zoolander,zucker,zucker brothers,zwick
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
X_test = vectorizer.transform(test["Phrase"])
X_test

<66292x50000 sparse matrix of type '<class 'numpy.int64'>'
	with 414654 stored elements in Compressed Sparse Row format>

In [15]:
y_train = train["Sentiment"]

print(y_train.shape)
y_train.head()

(156060,)


PhraseId
1    1
2    2
3    2
4    2
5    2
Name: Sentiment, dtype: int64

### Train

In [16]:
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(random_state=37)
model

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=50, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=37, shuffle=True,
       tol=None, verbose=0, warm_start=False)

### Scoring

In [17]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GroupKFold

kfold = GroupKFold(n_splits=5)

y_predict = cross_val_predict(model, X_train, y_train,
                              cv=kfold, groups=train["SentenceId"])

print(y_predict.shape)
y_predict

(156060,)


array([3, 2, 2, ..., 2, 2, 2])

In [18]:
from sklearn.metrics import accuracy_score

score = accuracy_score(y_train, y_predict)

print("Score = {0:0.6f}".format(score))

Score = 0.577771


In [19]:
import numpy as np

result = train.copy()
result["Sentiment(predict)"] = y_predict

result["Distance"] = result["Sentiment"] - result["Sentiment(predict)"]
result["Distance"] = np.abs(result["Distance"])

result = result.sort_values(by="Distance", ascending=False)
result.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(Origin),Sentiment(predict),Distance
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
54390,2703,Half Past Dead be just such achievement .,4,Half Past Dead is just such an achievement .,0,4
65741,3333,"it 's loathsom movie , it really be it make ab...",0,"It 's a loathsome movie , it really is and it ...",4,4
64316,3255,even stuffiest cinema goer will laugh their \*...,4,even the stuffiest cinema goers will laugh the...,0,4
64323,3255,will laugh their \*\*\* off for hour-and-a-half,4,will laugh their \*\*\* off for an hour-and-a-...,0,4
90343,4702,"origin idea -LRB- role , edit , score , anythi...",0,"nary an original idea -LRB- or role , or edit ...",4,4


In [20]:
# result[result["Phrase"].str.contains("can't")]

In [21]:
result[0:10000].to_csv("result.csv")

In [22]:
pd.DataFrame(vocabulary).to_csv("vocabulary.csv")

### Predict

In [23]:
model.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=50, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=37, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [24]:
predictions = model.predict(X_test)

print(predictions.shape)
predictions

(66292,)


array([3, 3, 2, ..., 2, 2, 2])

## Submit

In [25]:
submission = pd.read_csv("input/sampleSubmission.csv", index_col="PhraseId")

print(submission.shape)
submission.head()

(66292, 1)


Unnamed: 0_level_0,Sentiment
PhraseId,Unnamed: 1_level_1
156061,2
156062,2
156063,2
156064,2
156065,2


In [26]:
submission["Sentiment"] = predictions

print(submission.shape)
submission.head()

(66292, 1)


Unnamed: 0_level_0,Sentiment
PhraseId,Unnamed: 1_level_1
156061,3
156062,3
156063,2
156064,3
156065,2


In [27]:
submission.to_csv("baseline-script.csv")