In [13]:
import pandas as pd

### Load Dataset

In [14]:
train = pd.read_csv("data/train.tsv", sep="\t", index_col="PhraseId")

print(train.shape)
train.head()

(156060, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


In [15]:
test = pd.read_csv("data/test.tsv", sep="\t", index_col="PhraseId")

print(test.shape)
test.head()

(66292, 2)


Unnamed: 0_level_0,SentenceId,Phrase
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,8545,An intermittently pleasing but mostly routine ...
156062,8545,An intermittently pleasing but mostly routine ...
156063,8545,An
156064,8545,intermittently pleasing but mostly routine effort
156065,8545,intermittently pleasing but mostly routine


## Preprocessing

In [16]:
train["Phrase(origin)"] = train["Phrase"].copy()

print(train.shape)
train[["Phrase", "Phrase(origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [17]:
test["Phrase(origin)"] = test["Phrase"].copy()

print(test.shape)
test[["Phrase", "Phrase(origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Clean Text

In [18]:
import re

def clean_text(phrase):
    phrase = phrase.replace("doesn't ", "does not ")
    phrase = phrase.replace("ca n't ", "can not ")
    phrase = phrase.replace(" n't ", " not ")
    phrase = re.sub(r"[0-9]","",phrase)
    
    return phrase

train["Phrase"] = train["Phrase"].apply(clean_text)

print(train.shape)
train[["Phrase", "Phrase(origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [19]:
test["Phrase"] = test["Phrase"].apply(clean_text)

print(test.shape)
test[["Phrase", "Phrase(origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Stem phrases

In [20]:
from tqdm import tqdm
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')

def stem_phrase(phrase1):
    stemmed_words = [stemmer.stem(w) for w in phrase1.split(" ")]
    stemmed_phrase = " ".join(stemmed_words)

    return stemmed_phrase

tqdm.pandas(desc="Stemming...")
train["Phrase"].progress_apply(stem_phrase).head()

print(train.shape)
train[["Phrase", "Phrase(origin)"]].head()

Stemming...: 100%|██████████| 156060/156060 [00:24<00:00, 6326.05it/s]


(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [21]:
tqdm.pandas(desc="Stemming...")
test["Phrase"].progress_apply(stem_phrase).head()

print(test.shape)
test[["Phrase", "Phrase(origin)"]].head()

Stemming...: 100%|██████████| 66292/66292 [00:09<00:00, 7282.02it/s]


(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Negative,Netural,Positive

In [28]:
#Sentiment Analysis
negative_train = train[train["Sentiment"] < 2]
netural_train = train.loc[train.Sentiment == 2]
positive_train = train.loc[train.Sentiment > 2]

In [29]:
# negative vocabluer 구함
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# binary=True/False
# lowercase=True/False
# ngram_range=(1, 1)
# stop_words=None

# vectorizer = CountVectorizer(max_features=1000)
#stop_vectorizer = CountVectorizer(ngram_range=(1,3),max_df=0.5)
stop_vectorizer = TfidfVectorizer(analyzer='word', max_features=30000, ngram_range=(1, 3))

stop_vectorizer.fit(negative_train["Phrase"])
stop_vectorizer.transform(negative_train["Phrase"])

stop_words = stop_vectorizer.get_feature_names()

In [30]:
stop_words

['aaliyah',
 'aaliyah in',
 'aaliyah in her',
 'abandon',
 'abandon the',
 'abandon the theater',
 'abc',
 'abhorrent',
 'ability',
 'ability to',
 'ability to think',
 'able',
 'able project',
 'able to',
 'able to muster',
 'abomination',
 'abomination mean',
 'abomination mean alabama',
 'aborted',
 'aborted attempts',
 'about',
 'about an',
 'about and',
 'about and is',
 'about any',
 'about any aspect',
 'about as',
 'about as convincing',
 'about as if',
 'about bad',
 'about bad cinema',
 'about being',
 'about cinema',
 'about cinema only',
 'about crass',
 'about crass jaded',
 'about crime',
 'about every',
 'about every cliche',
 'about god',
 'about god is',
 'about growing',
 'about growing up',
 'about his',
 'about how',
 'about how lame',
 'about human',
 'about human infidelity',
 'about ignoring',
 'about ignoring what',
 'about it',
 'about it by',
 'about its',
 'about its titular',
 'about making',
 'about making movie',
 'about mary',
 'about minutes',
 'about mi

In [31]:
## 2 ~3 
#netu_vectorizer = CountVectorizer(ngram_range=(1,3),max_df=0.5)
netu_vectorizer = TfidfVectorizer(analyzer='word', max_features=30000, ngram_range=(1, 3))

netu_vectorizer.fit(netural_train["Phrase"])
netu_vectorizer.transform(netural_train["Phrase"])

netu_words = netu_vectorizer.get_feature_names()

In [32]:
netu_words

['aaliyah',
 'abagnale',
 'abagnale antics',
 'abandon',
 'abbott',
 'abc',
 'abdul',
 'abdul malik',
 'abdul malik abbott',
 'abel',
 'abhors',
 'ability',
 'ability to',
 'ability to bore',
 'abject',
 'abject suffering',
 'able',
 'able to',
 'able to muster',
 'ably',
 'abound',
 'about',
 'about all',
 'about all the',
 'about an',
 'about an inhuman',
 'about anything',
 'about artifice',
 'about artifice and',
 'about as',
 'about bad',
 'about bad company',
 'about being',
 'about being stupid',
 'about black',
 'about black urban',
 'about cal',
 'about campus',
 'about campus depravity',
 'about chicago',
 'about chicago in',
 'about contract',
 'about contract on',
 'about critical',
 'about critical reaction',
 'about documentaries',
 'about documentaries in',
 'about drug',
 'about drug dealers',
 'about entrapment',
 'about entrapment in',
 'about eve',
 'about everything',
 'about existential',
 'about existential suffering',
 'about family',
 'about fear',
 'about fear 

In [33]:
#stops world 리스트에서 중복제거하며 합치기
stop_words = list(set(stop_words).union(set(netu_words)))

In [34]:
## 2 ~3
#posi_vectorizer = CountVectorizer(ngram_range=(1,3),max_df=0.5)
posi_vectorizer = TfidfVectorizer(analyzer='word', max_features=30000, ngram_range=(1, 3))
posi_vectorizer.fit(positive_train["Phrase"])
posi_vectorizer.transform(positive_train["Phrase"])

<42133x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 585969 stored elements in Compressed Sparse Row format>

In [35]:
posi_words = posi_vectorizer.get_feature_names()

In [36]:
#stops world 리스트에서 중복제거하며 합치기
vocabulary = list(set(stop_words).union(set(posi_words)))

### Vectorize phrases

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

char_vectorizer = TfidfVectorizer(analyzer='char', max_features=10000, ngram_range=(1, 9))
char_vectorizer

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 9), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [38]:
char_vectorizer.fit(train["Phrase"])

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 9), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [39]:
X_train_char = char_vectorizer.transform(train["Phrase"])
X_train_char

<156060x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 20097469 stored elements in Compressed Sparse Row format>

In [40]:
X_test_char = char_vectorizer.transform(test["Phrase"])
X_test_char

<66292x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 7898338 stored elements in Compressed Sparse Row format>

In [48]:
#word_vectorizer = TfidfVectorizer(analyzer='word', max_features=30000, ngram_range=(1, 2))
word_vectorizer = TfidfVectorizer(analyzer='word', max_features=30000, ngram_range=(1, 3),vocabulary=vocabulary)
word_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=['at the door', 'insulting or', 'surfer', 'flowering of the', 'encumbers itself', 'been fumbled by', 'style parody blaxploitation', 'drunk love', 'of jonathan', 'obvious melodrama and', 'however it', 'brilliant and', 'substance it so', 'and the basic', 'move', 'shenanigans in welcome', 'p...'unresolved moral conflict', 'roller coaster', 'mundane', 'industry in', 'sand creeping', 'is mere'])

In [49]:
word_vectorizer.fit(train["Phrase"])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=['at the door', 'insulting or', 'surfer', 'flowering of the', 'encumbers itself', 'been fumbled by', 'style parody blaxploitation', 'drunk love', 'of jonathan', 'obvious melodrama and', 'however it', 'brilliant and', 'substance it so', 'and the basic', 'move', 'shenanigans in welcome', 'p...'unresolved moral conflict', 'roller coaster', 'mundane', 'industry in', 'sand creeping', 'is mere'])

In [50]:
X_train_word = word_vectorizer.transform(train["Phrase"])
X_train_word

<156060x69810 sparse matrix of type '<class 'numpy.float64'>'
	with 1868523 stored elements in Compressed Sparse Row format>

In [51]:
X_test_word = word_vectorizer.transform(test["Phrase"])
X_test_word

<66292x69810 sparse matrix of type '<class 'numpy.float64'>'
	with 501233 stored elements in Compressed Sparse Row format>

In [52]:
from scipy.sparse import hstack

X_train = hstack([X_train_char, X_train_word])
X_train

<156060x79810 sparse matrix of type '<class 'numpy.float64'>'
	with 21965992 stored elements in COOrdinate format>

In [53]:
X_test = hstack([X_test_char, X_test_word])
X_test

<66292x79810 sparse matrix of type '<class 'numpy.float64'>'
	with 8399571 stored elements in COOrdinate format>

In [20]:
# columns = word_vectorizer.get_feature_names()
# pd.DataFrame(X_train.tocsr()[:100].toarray(), columns=columns).head()

In [54]:
y_train = train["Sentiment"]

print(y_train.shape)
y_train.head()

(156060,)


PhraseId
1    1
2    2
3    2
4    2
5    2
Name: Sentiment, dtype: int64

In [55]:
sentence_ids = train["SentenceId"]

print(sentence_ids.shape)
sentence_ids.head()

(156060,)


PhraseId
1    1
2    1
3    1
4    1
5    1
Name: SentenceId, dtype: int64

## Score

In [56]:
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(alpha=0.000006762746, random_state=37)
model



SGDClassifier(alpha=6.762746e-06, average=False, class_weight=None,
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=37, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [57]:
# from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GroupKFold

kfold = GroupKFold(n_splits=5)

y_predict = cross_val_predict(model, X_train, y_train,
                              cv=kfold, groups=sentence_ids)

print(y_predict.shape)
y_predict[0:10]

(156060,)


array([1, 2, 2, 2, 2, 3, 2, 3, 2, 3], dtype=int64)

In [58]:
from sklearn.metrics import accuracy_score

score = accuracy_score(y_train, y_predict)
print("Score = {0:.5f}".format(score))

Score = 0.60164


Score = 0.60041

In [59]:
import numpy as np

result = train.copy()
result["Sentiment(predict)"] = y_predict
result["Difference(Phrase)"] = np.abs(y_train - y_predict)

print(result.shape)
result.head()

(156060, 6)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(origin),Sentiment(predict),Difference(Phrase)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,A series of escapades demonstrating the adage ...,1,A series of escapades demonstrating the adage ...,1,0
2,1,A series of escapades demonstrating the adage ...,2,A series of escapades demonstrating the adage ...,2,0
3,1,A series,2,A series,2,0
4,1,A,2,A,2,0
5,1,series,2,series,2,0


In [60]:
sentiment = result.groupby("SentenceId")["Difference(Phrase)"].mean()
print(sentiment.shape)
sentiment.head()

(8529,)


SentenceId
1    0.238095
2    0.500000
3    0.142857
4    0.400000
5    0.500000
Name: Difference(Phrase), dtype: float64

In [61]:
def find_sentiment(sentence_id):
    return sentiment.loc[sentence_id]

result["Difference(Sentence)"] = result["SentenceId"].apply(find_sentiment)
result = result.sort_values(by="Difference(Sentence)", ascending=False)

print(result.shape)
result.head()

(156060, 7)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(origin),Sentiment(predict),Difference(Phrase),Difference(Sentence)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
79350,4087,can not recommend it .,0,ca n't recommend it .,4,4,4.0
79349,4087,I can not recommend it .,0,I ca n't recommend it .,4,4,4.0
113297,6020,A real snooze .,0,A real snooze .,3,3,3.0
113298,6020,real snooze .,0,real snooze .,3,3,3.0
18421,807,Execrable .,0,Execrable .,2,2,2.5


In [62]:
result[0:1000].to_csv("result.csv")

In [30]:
# vocabulary = vectorizer.get_feature_names()
# vocabulary[0:3]

In [31]:
# pd.DataFrame(vocabulary, columns=["word"]).to_csv("vocabulary.csv")

In [32]:
# result[result["Phrase"].str.contains("can not recommend")]

## Train

In [63]:
#!pip install xgboost

Collecting xgboost
  Downloading xgboost-0.7.post3.tar.gz (450kB)


No files/directories in C:\Users\user\AppData\Local\Temp\pip-build-a_t6bis_\xgboost\pip-egg-info (from PKG-INFO)


In [64]:
# import xgboost as xgb

# dtrain = xgb.DMatrix(X_train, label=y_train)

# params = {
#     'booster': 'gblinear',
#     'objective': 'multi:softmax',
#     'eval_metric': 'merror',
#     'lambda': 2.186753e-03,
#     'alpha': 1.286904,
#     'lambda_bias': 6.191707e+00,
#     'num_class': 5,
#     'nthread': 8,
#     'silent': 1,
# }

# %time booster = xgb.train(params, dtrain, num_boost_round=98)



Wall time: 1min 59s


In [67]:
# dtest = xgb.DMatrix(X_test.toarray())

# predictions = booster.predict(dtest)


MemoryError: 

In [68]:
model.fit(X_train, y_train)

SGDClassifier(alpha=6.762746e-06, average=False, class_weight=None,
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=37, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [69]:
X_test

<66292x79810 sparse matrix of type '<class 'numpy.float64'>'
	with 8399571 stored elements in COOrdinate format>

In [70]:
#predictions = model.predict(X_test)
predictions = model.predict(X_test)

print(predictions.shape)
predictions[0:10]

(66292,)


array([3, 3, 2, 3, 3, 3, 3, 2, 3, 2], dtype=int64)

In [71]:
submission = pd.read_csv("data/sampleSubmission.csv", index_col="PhraseId")

submission["Sentiment"] = predictions

print(submission.shape)
submission.head()

(66292, 1)


Unnamed: 0_level_0,Sentiment
PhraseId,Unnamed: 1_level_1
156061,3
156062,3
156063,2
156064,3
156065,3


## Submit

In [None]:
submission = pd.read_csv("data/sampleSubmission.csv", index_col="PhraseId")

submission["Sentiment"] = predictions.astype('int')

print(submission.shape)
submission.head()

In [72]:
# 경로(ex: baseline-script.csv)는 사용자 설정마다 다름
#submission.to_csv("use-xgboost.csv")
# 경로(ex: baseline-script.csv)는 사용자 설정마다 다름
filename = "use-xgboost_{score}.csv".format(score="{0:.5f}".format(score))
submission.to_csv(filename)