# Transfer Learning



## Libraries & Data

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import pickle
from sklearn.dummy import DummyRegressor
from nltk.stem import PorterStemmer
from sklearn.linear_model import LassoCV
from scipy.stats import kendalltau

Download nltk package

In [None]:
nltk.download('all')

Kendall accuracy function

In [3]:
def kendall_acc(x, y, percentage=True):
    tau, _ = kendalltau(x, y)
    kt_acc = 0.5 + tau / 2
    n = len(x)
    kt_se = np.sqrt((kt_acc * (1 - kt_acc)) / n)
    lower = kt_acc - 1.96 * kt_se
    upper = kt_acc + 1.96 * kt_se
    report = pd.DataFrame({
        "acc": [kt_acc],
        "lower": [lower],
        "upper": [upper]
    }).round(4)
    if percentage:
        report *= 100
    return report

Review text processing funtion

In [4]:
# clean useless content in the text
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+[^\s]*', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    return text.strip()

# stem the words in the text
ps = PorterStemmer()
def stem_text(text):
    words = word_tokenize(text)
    stemmed_words = [ps.stem(word) for word in words]
    return " ".join(stemmed_words)

Load Users Review


In [5]:
user_df = pd.read_csv("./user_df_filtered_nplus.csv")

# clean review and stem
user_df["Clean_Review"] = user_df["Review"].apply(clean_text)
user_df["Clean_Review"] = user_df["Clean_Review"].apply(stem_text)
user_df.head()

Unnamed: 0.1,Unnamed: 0,Game Name,Review,Rating,Clean_Review
0,0,Wolfenstein: The Old Blood,Decided to play some culturally and historical...,70,decid to play some cultur and histor relev med...
1,1,Wolfenstein: The Old Blood,if you are very hungry for more neo-wolfenstei...,70,if you are veri hungri for more neowolfenstein...
2,2,Wolfenstein: The Old Blood,Slightly less fun version of the New Order,60,slightli less fun version of the new order
3,3,Wolfenstein: The Old Blood,Part 1 is a tad of a slog by comparison with p...,70,part is a tad of a slog by comparison with par...
4,4,Wolfenstein: The Old Blood,This was a relatively short game. I beat the g...,60,thi wa a rel short game i beat the game on ube...


## Apply users model to users review
To draw the coefficient plot on users model

In [6]:
# split date to test and train set
X_temp, _, y_train, y_test = train_test_split(user_df[["Clean_Review"]],
                                              user_df["Rating"],
                                              test_size=0.2,
                                              random_state=42)
scaler_y = MinMaxScaler()
y_train_normalized = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
y_test_normalized = scaler_y.transform(y_test.values.reshape(-1, 1))

In [7]:
# get tfidf features
tfidf_vectorizer = TfidfVectorizer(max_features=5000, 
                                   ngram_range=(1, 3), 
                                   stop_words='english', 
                                   min_df=5, 
                                   max_df=0.95)
X_tfidf_train = tfidf_vectorizer.fit_transform(X_temp["Clean_Review"])
X_tfidf_test = tfidf_vectorizer.transform(user_df.loc[y_test.index, "Clean_Review"])

In [8]:
# get ngram features
ngram_vectorizer = CountVectorizer(max_features=5000, 
                                   ngram_range=(1, 3), 
                                   stop_words='english', 
                                   min_df=5, 
                                   max_df=0.95)
X_ngrams_train = ngram_vectorizer.fit_transform(X_temp["Clean_Review"])
X_ngrams_test = ngram_vectorizer.transform(user_df.loc[y_test.index, "Clean_Review"])

In [9]:
# load lasso model
lasso = pickle.load(open('lasso_model_user.sav', 'rb'))

# get the feature names
feature_names_tfidf = tfidf_vectorizer.get_feature_names_out()
feature_names_ngrams = ngram_vectorizer.get_feature_names_out()

# get important features (with nonzero coefficient)
important_features_users = []
for idx, coef in enumerate(lasso.coef_):
    if abs(coef) > 0:
        if idx < len(feature_names_tfidf):
            important_features_users.append((feature_names_tfidf[idx], coef))
        elif idx < len(feature_names_tfidf)  + len(feature_names_ngrams):
            important_features_users.append((feature_names_ngrams[idx - len(feature_names_tfidf)], coef))
        else:
            important_features_users.append((f"Embedding_{idx - len(feature_names_tfidf)  - len(feature_names_ngrams)}", coef))


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


### coefficient plot of users
Here, we just merge to get the coefficient-frequency data.
The plot is done by R (coefficent_plot.R)

In [9]:
# get all the features name
all_feature_names_train = np.unique(np.concatenate([feature_names_tfidf, feature_names_ngrams]))
vectorizer = CountVectorizer(vocabulary=all_feature_names_train)
X_all_features = vectorizer.fit_transform(X_temp["Clean_Review"])

# calculate the features frequency
word_doc_frequency = (X_all_features > 0).sum(axis=0) / X_all_features.shape[0]

# convert frequency data to DataFrame
word_freq_df = pd.DataFrame({"word": all_feature_names_train, "doc_frequency": np.array(word_doc_frequency).flatten()})
word_freq_df = word_freq_df.sort_values(by="doc_frequency", ascending=False)

# merge the coefficient date to the frequency data
important_features_df = pd.DataFrame(important_features_users, columns=["word", "importance"])
word_freq_df = word_freq_df.rename(columns={"doc_frequency": "frequency"})
freq_coef_user = pd.merge(important_features_df, word_freq_df, on="word", how="inner")

# save merged data
freq_coef_user.to_csv("freq_coef_user.csv", index=False)
print(freq_coef_user.head())


             word  importance  frequency
0         abandon   -0.001038   0.004019
1            abov    0.000400   0.006402
2         absolut    0.001268   0.048148
3  absolut incred    0.000335   0.000000
4    absolut love    0.000110   0.000000


## Apply users model to Media review
Load media data


In [10]:
media_df = pd.read_csv("myMediaReviews.csv")
media_df.dropna(subset=["Snippet"], inplace=True)
media_df.drop(columns=["OpenCritic URL","Description","Release Date","Review Title","Published Date","Review URL","Language"], inplace=True)


### Filter media review

In [11]:
# Select common games of users and media
media_games = media_df["Game"].unique()
user_games = user_df["Game Name"].unique()
common_games = set(media_games) & set(user_games)
media_df_filtered = media_df[media_df["Game"].isin(user_games)]
media_df_filtered["Game"].nunique()
media_df_filtered.drop(columns=["Tier"], inplace=True)

# Select games with review number over than 50
n = 50
game_counts = media_df_filtered["Game"].value_counts()
games_with_n_or_more_reviews = game_counts[game_counts >= n].index
media_df_filtered_nplus = media_df_filtered[media_df_filtered["Game"].isin(games_with_n_or_more_reviews)]
media_df_filtered_nplus.nunique()
media_df_filtered_nplus.reset_index(drop=True, inplace=True)

# Clean the media review text
media_df = media_df_filtered_nplus
media_df.dropna(subset=["Snippet"], inplace=True)
media_df["Clean_Snippet"] = media_df["Snippet"].apply(clean_text)
media_df["Clean_Snippet"] = media_df["Clean_Snippet"].apply(stem_text)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  media_df_filtered.drop(columns=["Tier"], inplace=True)


### Get the X(features of text) and Y(media score)
 

In [12]:
# normalized Y
y_media = media_df["Score"].values
scaler_y = MinMaxScaler(feature_range=(-1, 1))
y_media_normalized = scaler_y.fit_transform(y_media.reshape(-1, 1))

# get features

# tfidf
X_tfidf_media = tfidf_vectorizer.transform(media_df["Clean_Snippet"])
# ngrams
X_ngrams_media = ngram_vectorizer.transform(media_df["Clean_Snippet"])
# embedding
def tokenize_reviews(reviews):
    return [simple_preprocess(review) for review in reviews]
tokenized_reviews = tokenize_reviews(user_df["Clean_Review"].tolist())

def get_sentence_embedding(review, model, vector_size=100):
    words = simple_preprocess(review)
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

model = Word2Vec(sentences=tokenized_reviews, 
                 vector_size=100,
                 window=5, 
                 min_count=5,
                 workers=4)

batch_size = 10000
embeddings_media = []
for i in range(0, len(media_df), batch_size):
    batch_reviews = media_df["Clean_Snippet"].iloc[i:i + batch_size].tolist()
    batch_embeddings = [get_sentence_embedding(review, model) for review in batch_reviews]
    embeddings_media.extend(batch_embeddings)
X_embedding_media = np.array(embeddings_media, dtype=np.float32)
X_embedding_media_sparse = scipy.sparse.csr_matrix(X_embedding_media)

### Apply the lasso model on media data

In [14]:
# input X
X_media_unique = scipy.sparse.hstack([X_tfidf_media,  X_ngrams_media, X_embedding_media_sparse])
# normalize X
scaler_x_filename = 'scaler_x.plk'
scaler_x = pickle.load(open(scaler_x_filename, 'rb'))
X_media_scaled = scaler_x.transform(X_media_unique)


# predict
y_pred_media_lasso = lasso.predict(X_media_scaled)

# calculate mse and rmse
mse_media = mean_squared_error(y_media_normalized, y_pred_media_lasso)
print(f"Mean Squared Error on Media Data (Normalized -1 to 1): {mse_media:.4f}")
rmse_media = np.sqrt(mse_media)
print(f"Root Mean Squared Error on Media Data (Normalized -1 to 1): {rmse_media:.4f}")

y_pred_media_original = scaler_y.inverse_transform(y_pred_media_lasso.reshape(-1, 1))
y_media_original = scaler_y.inverse_transform(y_media_normalized)

# kendall accuracy
acc_transfer_lasso = kendall_acc(y_pred_media_original, y_media_original)
print("Transfer learning lasso kendall accuracy")
print(acc_transfer_lasso)

Mean Squared Error on Media Data (Normalized -1 to 1): 0.0962
Root Mean Squared Error on Media Data (Normalized -1 to 1): 0.3102
Transfer learning lasso kendall accuracy
     acc  lower  upper
0  68.12  67.65  68.59


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## Train new model on media review

In [15]:
# test and train set
X_temp, _, y_train, y_test = train_test_split(media_df[["Clean_Snippet"]],
                                              media_df["Score"],
                                              test_size=0.2,
                                              random_state=42)
scaler_y = MinMaxScaler()

# normalized y
y_train_normalized = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
y_test_normalized = scaler_y.transform(y_test.values.reshape(-1, 1))

In [16]:
# TFIDF feature
tfidf_vectorizer = TfidfVectorizer(max_features=5000, 
                                   ngram_range=(1, 3), 
                                   stop_words='english', 
                                   min_df=5, 
                                   max_df=0.95)
X_tfidf_train = tfidf_vectorizer.fit_transform(X_temp["Clean_Snippet"])
X_tfidf_test = tfidf_vectorizer.transform(media_df.loc[y_test.index, "Clean_Snippet"])

In [17]:
# Ngram feature
ngram_vectorizer = CountVectorizer(max_features=5000, 
                                   ngram_range=(1, 3), 
                                   stop_words='english', 
                                   min_df=5, 
                                   max_df=0.95)
X_ngrams_train = ngram_vectorizer.fit_transform(X_temp["Clean_Snippet"])
X_ngrams_test = ngram_vectorizer.transform(media_df.loc[y_test.index, "Clean_Snippet"])

In [18]:
# list of media reviwes
def tokenize_reviews(reviews):
    return [simple_preprocess(review) for review in reviews]

tokenized_media_reviews = tokenize_reviews(media_df["Clean_Snippet"].tolist())

In [19]:
# embedding feature
model = Word2Vec(sentences=tokenized_media_reviews, 
                 vector_size=100,
                 window=5, 
                 min_count=5,
                 workers=4)

def get_sentence_embedding(review, model, vector_size=100):
    words = simple_preprocess(review)
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

batch_size = 10000
embeddings_train = []
for i in range(0, len(X_temp), batch_size):
    batch_reviews = X_temp["Clean_Snippet"].iloc[i:i + batch_size].tolist()
    batch_embeddings = [get_sentence_embedding(review, model) for review in batch_reviews]
    embeddings_train.extend(batch_embeddings)
X_embedding_train = np.array(embeddings_train, dtype=np.float32)

embeddings_test = []
for i in range(0, len(y_test), batch_size):
    batch_reviews = media_df.loc[y_test.index, "Clean_Snippet"].iloc[i:i + batch_size].tolist()
    batch_embeddings = [get_sentence_embedding(review, model) for review in batch_reviews]
    embeddings_test.extend(batch_embeddings)
X_embedding_test = np.array(embeddings_test, dtype=np.float32)

# build sparse matrix
X_embedding_train_sparse = scipy.sparse.csr_matrix(X_embedding_train)
X_embedding_test_sparse = scipy.sparse.csr_matrix(X_embedding_test)

In [22]:
X_train_combined = scipy.sparse.hstack([X_tfidf_train, X_ngrams_train, X_embedding_train_sparse])
X_test_combined = scipy.sparse.hstack([X_tfidf_test, X_ngrams_test, X_embedding_test_sparse])

scaler_x = StandardScaler(with_mean=False)
X_train_scaled = scaler_x.fit_transform(X_train_combined)
X_test_scaled = scaler_x.transform(X_test_combined)

In [23]:
# train model
lasso = Lasso(alpha=0.001)  
lasso.fit(X_train_scaled, y_train_normalized.flatten())

# save model
filename = 'lasso_model_media.sav'
pickle.dump(lasso, open(filename, 'wb'))

In [26]:
lasso = pickle.load(open('lasso_model_media.sav','rb'))

y_pred = lasso.predict(X_test_scaled)
mse_lasso = mean_squared_error(y_test_normalized, y_pred)
print(f"Mean Squared Error: {mse_lasso:.4f}")

rmse_lasso = np.sqrt(mse_lasso)
print(f"Root Mean Squared Error: {rmse_lasso:.4f}")

feature_names_tfidf = tfidf_vectorizer.get_feature_names_out()
feature_names_ngrams = ngram_vectorizer.get_feature_names_out()

# important features
important_features_meida = []
for idx, coef in enumerate(lasso.coef_):
    if abs(coef) > 0:
        if idx < len(feature_names_tfidf):
            important_features_meida.append((feature_names_tfidf[idx], coef))
        elif idx < len(feature_names_tfidf)  + len(feature_names_ngrams):
            important_features_meida.append((feature_names_ngrams[idx - len(feature_names_tfidf)], coef))
        else:
            important_features_meida.append((f"Embedding_{idx - len(feature_names_tfidf)  - len(feature_names_ngrams)}", coef))


Mean Squared Error: 0.0110
Root Mean Squared Error: 0.1048


In [25]:
y_pred_original = scaler_y.inverse_transform(y_pred.reshape(-1, 1))
print(f"Predicted Media Scores (0-100):")
print(y_pred_original.flatten()[:10])

y_test_original = scaler_y.inverse_transform(y_test_normalized)
print(f"Actual Media Scores (0-100):")
print(y_test_original.flatten()[:10])

# k acc
acc_media_lasso = kendall_acc(y_pred_original, y_test_original)
print(acc_media_lasso)

Predicted Media Scores (0-100):
[86.27726356 86.97206171 85.44281429 83.22377682 77.63991667 79.63552781
 85.40482226 69.75211467 81.97213745 97.23802978]
Actual Media Scores (0-100):
[ 88.  80.  80.  75.  70.  75.  75.  60.  84. 100.]
     acc  lower  upper
0  74.88  73.91  75.85


In [31]:
# get all the features name
all_feature_names_train = np.unique(np.concatenate([feature_names_tfidf, feature_names_ngrams]))
vectorizer = CountVectorizer(vocabulary=all_feature_names_train)
X_all_features = vectorizer.fit_transform(X_temp["Clean_Snippet"])
# calculate the features frequency
word_doc_frequency = (X_all_features > 0).sum(axis=0) / X_all_features.shape[0]

# convert frequency data to DataFrame
word_freq_df = pd.DataFrame({"word": all_feature_names_train, "doc_frequency": np.array(word_doc_frequency).flatten()})
word_freq_df = word_freq_df.sort_values(by="doc_frequency", ascending=False)


# merge the coefficient date to the frequency data
important_features_df = pd.DataFrame(important_features_meida, columns=["word", "importance"])
word_freq_df = word_freq_df.rename(columns={"doc_frequency": "frequency"})
freq_coef_media = pd.merge(important_features_df, word_freq_df, on="word", how="inner")

# save data
freq_coef_media.to_csv("freq_coef_media.csv", index=False)
print(freq_coef_media.head(10))


         word  importance  frequency
0        abil    0.000039   0.012552
1      accept   -0.000073   0.002152
2  accomplish    0.000796   0.005510
3      achiev    0.001440   0.014182
4      actual   -0.000243   0.013399
5          ad    0.000170   0.014671
6       admir   -0.000517   0.002641
7       admit    0.000077   0.001337
8        aegi    0.000045   0.001728
9      afraid    0.000195   0.001761
