In [1]:
import numpy as np
import pandas as pd
import nltk

pd.set_option('display.max_columns', 500)

# Przetwarzanie zbioru danych

Wczytanie zbioru danych

In [2]:
df = pd.read_csv("reviews_train.csv", nrows=100)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,summary,unixReviewTime,reviewTime,score
0,A35C43YE9HU9CN,B0064X7B4A,Joan Miller,"[0, 0]",I have decided not to play this game. I can't...,Friends,1396396800,"04 2, 2014",1.0
1,AHFS8CGWWXB5B,B00H1P4V3E,WASH ST. GAMER,"[3, 4]",The Amazon Appstore free app of the day for Ju...,"Amazon Makes This ""Longest Spring Ever"" for Fi...",1402272000,"06 9, 2014",2.0
2,A3EW8OTQ90NVHM,B00CLVW82O,Kindle Customer,"[0, 4]",this game was so mush fun I wish I could play ...,best,1368921600,"05 19, 2013",5.0
3,AJ3GHFJY1IUTD,B007T9WVKM,BrawlMaster4,"[0, 2]","Its pretty fun and very good looking, but you...",Fun Game,1350172800,"10 14, 2012",5.0
4,A3JJGBS4EL603S,B00J206J5E,"K. Wilson ""thesupe""","[0, 0]",good graphics; immersive storyline; hard to st...,great game!,1396915200,"04 8, 2014",5.0


Wstępne przetwarzanie danych

In [3]:
import json
from sklearn.preprocessing import LabelEncoder

# df['helpful'] = df['helpful'].str.replace("[", "")
# df['helpful'] = df['helpful'].str.replace("]", "")
# df[['helpfulP', 'helpfulN']] = df['helpful'].str.split(',', 1, expand=True)
# df['helpfulP'] = pd.to_numeric(df['helpfulP'])
# df['helpfulN'] = pd.to_numeric(df['helpfulN'])
# df = df.drop('helpful', axis=1)
# df.head()

# df['helpful'] = df['helpful'].str.replace("[", "")
# df['helpful'] = df['helpful'].str.replace("]", "")
# df[['helpfulP', 'helpfulN']] = df['helpful'].str.split(',', 1, expand=True)
# df['helpful'] = df['helpful'].astype(object)

# Wczytanie zmiana typu kolumny na taki użyteczny dla klasyfikatora.
for i, val in df.iterrows():
    val['helpful'] = json.loads(val['helpful'])

# Odrzucanie niepełnych kolumn, ponieważ są one nieznaczną częścią pełnego zbioru.
print(len(df) - len(df.dropna()))
df = df.dropna()

# Kodowanie cech tekstowych na liczbowe
encoders = {}
for col in ['reviewerID','asin','reviewerName', 'reviewTime']:
    enc = LabelEncoder().fit(df[col])
    df[col] = enc.transform(df[col])
    encoders.update({col:enc})

# Odrzucenie cechy reviewTime, ponieważ oznacza ona to samo co unixReviewTime a jest mniej praktyczna
df.drop("reviewTime", axis=1, inplace=True)

df.head()

0


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,summary,unixReviewTime,score
0,48,24,30,"[0, 0]",I have decided not to play this game. I can't...,Friends,1396396800,1.0
1,83,93,73,"[3, 4]",The Amazon Appstore free app of the day for Ju...,"Amazon Makes This ""Longest Spring Ever"" for Fi...",1402272000,2.0
2,53,75,39,"[0, 4]",this game was so mush fun I wish I could play ...,best,1368921600,5.0
3,86,41,9,"[0, 2]","Its pretty fun and very good looking, but you...",Fun Game,1350172800,5.0
4,59,94,34,"[0, 0]",good graphics; immersive storyline; hard to st...,great game!,1396915200,5.0


Podstawowe informacje o zbiorze

In [4]:
print("---Dataset description---\n", df.describe())
print( "---Correlation Matrix---\n", df.corr())

---Dataset description---
        reviewerID        asin  reviewerName  unixReviewTime       score
count  100.000000  100.000000     100.00000    1.000000e+02  100.000000
mean    48.590000   46.480000      49.37000    1.364138e+09    4.010000
std     28.869996   27.823571      28.81591    2.749736e+07    1.381662
min      0.000000    0.000000       0.00000    1.306368e+09    1.000000
25%     23.750000   22.750000      24.75000    1.338379e+09    3.000000
50%     48.500000   45.500000      49.50000    1.368360e+09    5.000000
75%     73.250000   70.250000      74.25000    1.388642e+09    5.000000
max     98.000000   95.000000      98.00000    1.405987e+09    5.000000
---Correlation Matrix---
                 reviewerID      asin  reviewerName  unixReviewTime     score
reviewerID        1.000000 -0.170206     -0.029381       -0.248989  0.119629
asin             -0.170206  1.000000     -0.125277        0.636796 -0.036912
reviewerName     -0.029381 -0.125277      1.000000       -0.145153 -

# Przetwarzanie języka

Tokenizacja oraz stemming

In [5]:
def get_stems(word: str):
    tokens = nltk.word_tokenize(word)

    porter = nltk.PorterStemmer()
    for i in range(len(tokens)):
        tokens[i] = porter.stem(tokens[i])

    return tokens

from collections import Counter
import re

words = Counter()
for i in df['reviewText'].index:
    words.update(get_stems(df['reviewText'][i]))

# words.most_common(50)

[('.', 225),
 ('it', 152),
 ('the', 133),
 ('i', 129),
 ('to', 112),
 (',', 105),
 ('and', 93),
 ('thi', 85),
 ('game', 80),
 ('!', 80),
 ('a', 75),
 ('is', 67),
 ('for', 62),
 ('you', 52),
 ('of', 48),
 ('play', 39),
 ('get', 38),
 ('have', 37),
 ('app', 37),
 ('on', 37),
 ('not', 36),
 ('my', 35),
 ('but', 33),
 (';', 32),
 ('so', 31),
 ('fun', 29),
 ('with', 28),
 ('that', 25),
 ('love', 25),
 ('&', 25),
 ('one', 23),
 ("n't", 22),
 ('do', 22),
 ('fire', 21),
 ("'s", 21),
 ('in', 20),
 ('great', 19),
 ('work', 19),
 ('u', 18),
 ('if', 18),
 ('are', 17),
 ('like', 17),
 ('time', 17),
 ('or', 15),
 ('other', 15),
 ('your', 15),
 ('kindl', 15),
 ('me', 15),
 ('would', 15),
 ('as', 14)]

Przetwarzanie uzyskanych stemmów.

In [6]:
for w in list(words):
    if re.search("\W", w) is not None:
        del words[w]

# words.most_common(50)

# nltk.corpus.stopwords.words('english')
### Rozważyć wywalanie not
stopwords = ["a", "about", "after", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been",
             "before", "being", "between", "both", "by", "could", "did", "do", "does", "doing", "during", "each",
             "for", "from", "further", "had", "has", "have", "having", "he", "her", "here", "hers", "herself", "him",
             "himself", "his", "how", "i", "in", "into", "is", "it", "its", "itself", "let", "me", "more", "most", "my",
             "myself", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "own", "sha",
             "she", "should", "so", "some", "such", "than", "that", "the", "their", "theirs", "them", "themselves",
             "then", "there", "there's", "these", "they", "this", "those", "through", "to", "until", "up", "very",
             "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "with", "would", "you",
             "your", "yours", "yourself", "yourselves",
             "n't", "'s", "'ll", "'re", "'d", "'m", "'ve",
             "above", "again", "against", "below", "but", "cannot", "down", "few", "if", "no", "nor", "not", "off",
             "out", "over", "same", "too", "under", "why"]

for w in stopwords:
    if w in words:
        del words[w]

# words.most_common(50)

# wywalanie najrzadszych tokenów
max_words = 50
for i in list(range(len(words)))[max_words:]:
    del words[i]

# words.most_common(50)

Tworzenie wektora Bag Of Words

In [7]:
from scipy.sparse import csr_matrix

#BOW
# def create_bow(documents, features):
#     row = []
#     col = []
#     data = []
#
#     labels = []
#
#     for i in documents.index:
#         tweet_tokens = get_stems(documents[i])
#
#         labels.append(label)
#         for token in set(tweet_tokens):
#             if token not in features:
#                 continue
#             row.append(i)
#             col.append(features[token])
#             data.append(1)
#     return csr_matrix((data, (row, col)), shape=(len(documents), len(features))), labels

min_word_count = 5

common_words = list([k for k, v in words.most_common() if v > min_word_count])

feature_dict = {}
for word in common_words:
    feature_dict[word] = len(feature_dict)

# X_train, y_train = create_bow(train_tweets, feature_dict)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=10)

df['reviewText'] = vectorizer.fit_transform(df['reviewText']).toarray().tolist()
df['summary'] = vectorizer.fit_transform(df['summary']).toarray().tolist()

df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,summary,unixReviewTime,score
0,48,24,30,"[0, 0]","[0, 0, 1, 0, 1, 1, 0, 1, 2, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1396396800,1.0
1,83,93,73,"[3, 4]","[7, 6, 3, 3, 5, 6, 18, 10, 9, 0]","[1, 1, 0, 0, 0, 0, 0, 1, 2, 0]",1402272000,2.0
2,53,75,39,"[0, 4]","[0, 1, 1, 0, 1, 0, 0, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1368921600,5.0
3,86,41,9,"[0, 2]","[1, 1, 1, 0, 0, 0, 0, 0, 1, 1]","[0, 0, 1, 1, 0, 0, 0, 0, 0, 0]",1350172800,5.0
4,59,94,34,"[0, 0]","[1, 1, 0, 0, 0, 0, 0, 0, 2, 1]","[0, 0, 0, 1, 1, 0, 0, 0, 0, 0]",1396915200,5.0


# Uczenie modelu

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier

y = df["score"]
X = df[list(set(df.columns) - {"score"})]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

clf = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=23)
clf.fit(X_train, y_train)

# Tutaj hiperparametry
# grid = GridSearchCV()

Rezultaty klasyfikacji i porównania z algorytmem większościowym oraz losowym.

In [None]:
import random
from sklearn.metrics import f1_score, precision_score, recall_score

list_of_labels = y_train.unique()


def scores(y_true, y_pred, name=None):
    print(f"=================== Results: {name} ===================")
    print(f"            {list_of_labels}   ")
    print("F1       ", f1_score(y_true, y_pred, average=None, pos_label=None, labels=list_of_labels))
    print("Precision", precision_score(y_true, y_pred, average=None, pos_label=None, labels=list_of_labels))
    print("Recall   ", recall_score(y_true, y_pred, average=None, pos_label=None, labels=list_of_labels))


y_pred_random, y_pred_majority = [],[]
majority_label = y_train.value_counts().index[0]

for _ in range(len(X_test)):
    y_pred_random.append(random.choice(list_of_labels))
    y_pred_majority.append(majority_label)

scores(y_test, y_pred_random, "RandomClassifier")
scores(y_test, y_pred_majority, "MajorityClassifier")
scores(y_test, clf.predict(X_test), "Trained Classifier")

            [4. 3. 5. 1. 2.]   
F1        [0.18181818 0.22222222 0.32       0.16666667 0.        ]
Precision [0.14285714 0.25       0.66666667 0.11111111 0.        ]
Recall    [0.25       0.2        0.21052632 0.33333333 0.        ]
            [4. 3. 5. 1. 2.]   
ERROR! Session/line number was not unique in database. History logging moved to new session 51
F1        [0.         0.         0.73076923 0.         0.        ]
Precision [0.         0.         0.57575758 0.         0.        ]
Recall    [0. 0. 1. 0. 0.]


  _warn_prf(average, modifier, msg_start, len(result))
