In [2]:
from transformers import pipeline, BertTokenizer
from IPython.display import clear_output
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, accuracy_score
import seaborn as sns
import os
import pandas as pd
import numpy as np
import torch
from time import time
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

extractor = pipeline("feature-extraction", model="distilbert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called f

In [3]:
splits = 200

def get_embedding(review):
    tokens = review.split()
    while len(tokenizer.tokenize(" ".join(tokens))) > 510:
        tokens = tokens[1:]
    tokens = " ".join(tokens)
    return extractor.transform(tokens)[0][-1]

def make_embedding(df, add_rating):
    dfs = []
    for id, indices in enumerate(np.array_split(df.index.values, splits)):
        temp = df.loc[indices]
        temp["embedding"] = temp.review.apply(get_embedding)
        out_df = pd.DataFrame(torch.tensor(temp.embedding.values.tolist()), index=temp.index)
        if add_rating:
            out_df["target"] = df["rating"]
        dfs.append(out_df)
        print(str(id + 1) + " out of: " + str(splits))
    output = pd.concat(dfs, axis=0)
    return output

In [4]:
train_data = pd.read_csv("train_data.csv")
test_data = pd.read_csv("test_data.csv", header=None)
test_data.columns = ["review"]

In [5]:
train_data

Unnamed: 0,review,rating
0,location not palace excellent hotel booke dthe...,4
1,respite definitely not place stay looking ultr...,3
2,stunning truly memorable spot right beach nusa...,4
3,solid business hotel near embassy stayed hotel...,3
4,nice place make sure lock money warning money ...,3
...,...,...
16387,great base explore new york stayed 4 nights en...,4
16388,wonderful advert paris wonderful introduction ...,4
16389,ideal relaxing holdiay rachel jay green liverp...,3
16390,"watch food, husband went resort 4 nights chris...",2


In [6]:
test_data

Unnamed: 0,review
0,great hotel location stayed 4 nts 24th 28th ja...
1,"n't return overall disappointed hotel, no hot ..."
2,great value location desired problem hotel loc...
3,kind helpfull people people kind helpful.we no...
4,absolutely fabulous melia comfortable star hot...
...,...
4094,cockroaches dirty carpeting not consider 10 de...
4095,"ca n't wait return, husband stayed el san juan..."
4096,coming home stay wind chimes inn like coming h...
4097,good hotel great location stayed apsis splendi...


In [7]:
#train_data_embedded = make_embedding(train_data, True)
#train_data_embedded.to_csv(os.path.join(os.getcwd(), "train_embedded.csv"),)
train_data_embedded = pd.read_csv("train_embedded.csv", index_col=0)

In [8]:
#test_data_embedded = make_embedding(test_data, False)
#test_data_embedded.to_csv(os.path.join(os.getcwd(), "test_embedded.csv"),)
test_data_embedded = pd.read_csv("test_embedded.csv", index_col=0)

In [9]:
train_data_embedded

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,target
0,0.731024,0.250655,-0.089744,0.520189,-0.351825,-0.796953,0.368891,-0.485046,0.544405,0.012710,...,-0.248937,-0.329662,-1.336018,0.294242,-1.056268,-0.317274,-0.079945,-0.374467,-0.185057,4
1,0.852949,0.236835,0.063832,0.469964,-0.363206,-1.045099,0.504673,-0.323598,0.351677,-0.058628,...,-0.053854,-0.379581,-1.176143,0.283369,-0.969518,-0.676686,-0.048067,-0.012696,-0.413625,3
2,0.685062,0.223866,-0.147828,0.606955,-0.273633,-1.010276,0.427389,-0.193593,0.612398,-0.289408,...,-0.314359,-0.234852,-0.970357,0.102805,-0.914143,-0.388480,-0.169932,-0.066460,-0.435708,4
3,0.604577,0.156163,0.297397,0.451490,-0.491574,-0.985112,0.442942,-0.197288,0.268713,-0.115580,...,-0.330194,-0.097302,-0.903164,0.193958,-0.756460,-0.614573,-0.158738,0.176252,-0.584963,3
4,0.880773,0.142636,-0.071651,0.651384,-0.365893,-1.035383,0.384419,-0.315585,0.555099,-0.049492,...,-0.309621,-0.255526,-0.991006,0.148594,-0.816133,-0.232153,-0.065063,-0.188306,-0.234032,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16387,0.835931,0.129018,0.022031,0.611350,-0.468871,-0.967782,0.495816,-0.377074,0.321901,-0.078694,...,-0.251063,-0.303376,-1.200032,0.222648,-0.893699,-0.576341,-0.013637,-0.038671,-0.373459,4
16388,0.940912,0.275095,0.001012,0.571690,-0.366752,-0.934709,0.502089,-0.325906,0.531427,-0.064609,...,-0.124844,-0.253585,-1.128829,0.364104,-0.857907,-0.352951,-0.022036,-0.106580,-0.261812,4
16389,0.777620,0.096441,-0.056846,0.591733,-0.312893,-1.033518,0.436032,-0.263038,0.386873,0.010131,...,-0.326421,-0.174483,-1.163905,0.222434,-0.881252,-0.462759,0.088253,0.021210,-0.329125,3
16390,0.732554,0.293466,-0.137129,0.726583,-0.281647,-0.734680,0.368102,-0.346282,0.417760,-0.047382,...,-0.211326,-0.092348,-1.141169,0.191584,-0.932992,-0.262553,0.038973,-0.071927,-0.228940,2


In [10]:
test_data_embedded

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.938493,0.183004,0.063817,0.606710,-0.451647,-1.035825,0.497661,-0.335741,0.296220,-0.089404,...,0.320153,-0.215700,-0.141498,-1.083900,0.211511,-0.850142,-0.530651,-0.013141,-0.012572,-0.348831
1,0.706570,0.297040,0.028349,0.577732,-0.434663,-0.822771,0.561691,-0.450103,0.477983,0.191853,...,0.391724,-0.174689,-0.279862,-1.175929,0.155675,-1.090621,-0.480290,-0.028343,-0.080079,-0.208822
2,0.641337,0.111375,-0.158317,0.652595,-0.282199,-0.709899,0.479667,-0.350484,0.658158,-0.103099,...,0.460593,-0.204626,-0.182919,-1.175709,0.084220,-1.226739,-0.539408,-0.012542,-0.196584,-0.291981
3,0.795235,0.288830,0.081681,0.478825,-0.349272,-0.951365,0.739767,0.119375,0.262919,-0.324973,...,0.511579,-0.089489,-0.287653,-0.946852,0.157680,-0.753880,-0.655168,-0.502852,0.179739,-0.493145
4,0.878483,0.168300,0.053744,0.619344,-0.258921,-0.971106,0.514012,-0.315954,0.336959,-0.046220,...,0.448769,-0.251376,-0.189057,-1.224119,0.270882,-0.938721,-0.318829,-0.003921,-0.071376,-0.336138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4094,0.691393,0.205510,0.025593,0.584929,-0.429532,-0.889108,0.373620,-0.327372,0.362951,-0.157241,...,0.571003,-0.216551,-0.152186,-1.159467,0.071518,-0.893078,-0.720746,-0.094627,0.159722,-0.414767
4095,0.785465,0.201247,0.059720,0.534150,-0.293193,-1.000153,0.477972,-0.359672,0.261462,0.010799,...,0.269770,-0.172465,-0.264479,-1.016061,0.078172,-0.816504,-0.544462,-0.045216,-0.075585,-0.418826
4096,0.778980,0.311188,-0.053091,0.542757,-0.259970,-1.140321,0.566421,-0.276461,0.482846,-0.077032,...,0.432233,-0.255068,-0.210363,-1.114083,0.263386,-0.951518,-0.053777,-0.038128,-0.146547,-0.352711
4097,0.903361,0.103976,-0.061863,0.502919,-0.379310,-0.841914,0.578198,-0.225339,0.459133,-0.083272,...,0.476695,-0.134333,-0.282957,-0.949641,0.156362,-0.719047,-0.611670,0.055877,-0.075826,-0.399337


In [33]:
from sklearn.model_selection import train_test_split
complete_set = train_data_embedded
X = complete_set.loc[:, complete_set.columns != 'target']
y = complete_set['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [46]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

model = make_pipeline(StandardScaler(), SVC(class_weight='balanced'))
model.fit(X_train, y_train)

In [47]:
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print(f"TRAIN:\n balanced_accuracy = {balanced_accuracy_score(y_train, y_pred_train)}, accuracy = {accuracy_score(y_train, y_pred_train)}\n f1_score: {f1_score(y_train, y_pred_train, average='weighted')}")
print(f"TEST: \n balanced_accuracy = {balanced_accuracy_score(y_test, y_pred_test)}, accuracy = {accuracy_score(y_test, y_pred_test)}\n f1_score: {f1_score(y_test, y_pred_test, average='weighted')}")

TRAIN:
 balanced_accuracy = 0.8071860558243801, accuracy = 0.7414016624723557
 f1_score: 0.7430589646134262
TEST: 
 balanced_accuracy = 0.5098842515073029, accuracy = 0.5468130527599878
 f1_score: 0.5576391365979276


In [19]:
predictions = model.predict(test_data_embedded)
pd.Series(predictions).to_csv("piatek_Brus_Maj.csv", index=None, header=None)