# <center>Intro To Intelligent Systems</center>
## <center>Xbox Recommendation System</center>
## <center>Anudeep, Manasa, Abhishek</center>

Importing the required libraries

In [132]:
pip install tqdm


Note: you may need to restart the kernel to use updated packages.


In [149]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

Loading the dataset

In [96]:
train_df = pd.read_csv("acm-sf-chapter-hackathon-small/train.csv")
test_df = pd.read_csv("acm-sf-chapter-hackathon-small/test.csv")

Data Cleaning

In [97]:
# Handling missing values

In [98]:
train_df.isnull().sum()

user          0
sku           0
category      0
query         0
click_time    0
query_time    0
dtype: int64

In [99]:
test_df.isnull().sum()

user          0
category      0
query         0
click_time    0
query_time    0
dtype: int64

In [100]:
#Since there is no null values, 
#we have to check for uniques values 
#in every column

In [101]:
for column in train_df.columns:
    unique_values = train_df[column].unique()
    print(f"Unique values in '{column}': {unique_values}")

Unique values in 'user': ['0001cd0d10bbc585c9ba287c963e00873d4c0bfd'
 '00033dbced6acd3626c4b56ff5c55b8d69911681'
 '0007756f015345450f7be1df33695421466b7ce4' ...
 'fffa393d127dec90b7eae4718535bd16be3b394d'
 'fffd288ec29a96dbac7356bcda0a1e9f88255a5b'
 'ffff8ecfaaf3f1fab67c8f6739c131ec818dd4a6']
Unique values in 'sku': [         2032076          9854804          2670133          9984142
          2541184          3046066          2977637          9328943
          1180104          2598445          1563461          1012721
          2173065          1228939          2375195          2480232
          2467183          1814093          2807036          2095189
          2945052          9487067          9955514          2833031
          9902347          2633149          3001046          1776209
          2633103          9254111          9374134          2704058
          1251132          2613542          8814811          2842639
          2107458          2467129          2330703          

In [102]:
for column in test_df.columns:
    unique_values = test_df[column].unique()
    print(f"Unique values in '{column}': {unique_values}")

Unique values in 'user': ['00025eb02b249434554fe2cacd8562db325df127'
 '00033dbced6acd3626c4b56ff5c55b8d69911681'
 '000548d17532b70071b7d59edd4797aed1823c60' ...
 'fffa95be88e8210312518a884d14e39e13d727eb'
 'fffb154d387316747181e5d647c0b7eac8f6e064'
 'fffd288ec29a96dbac7356bcda0a1e9f88255a5b']
Unique values in 'category': ['abcat0701002']
Unique values in 'query': ['child eden' 'Revelations' 'Gears of war' ... 'Duke'
 'Arkham city controller' 'Modern warfare3 xbox']
Unique values in 'click_time': ['2011-09-02 13:05:24.146' '2011-09-25 13:38:14.74'
 '2011-09-16 17:34:43.596' ... '2011-10-25 17:41:14.568'
 '2011-09-14 11:16:11.579' '2011-10-10 08:47:57.554']
Unique values in 'query_time': ['2011-09-02 13:04:34.633' '2011-09-25 13:37:58.382'
 '2011-09-16 17:33:43.237' ... '2011-10-25 17:41:05.683'
 '2011-09-14 11:14:41.861' '2011-10-10 08:43:56.768']


In [103]:
for column in train_df.columns:
    unique_values = train_df[column].nunique()
    print(f"Unique values in '{column}': {unique_values}")

Unique values in 'user': 38024
Unique values in 'sku': 413
Unique values in 'category': 1
Unique values in 'query': 5960
Unique values in 'click_time': 42364
Unique values in 'query_time': 40385


In [104]:
for column in test_df.columns:
    unique_values = test_df[column].nunique()
    print(f"Unique values in '{column}': {unique_values}")

Unique values in 'user': 26196
Unique values in 'category': 1
Unique values in 'query': 4605
Unique values in 'click_time': 28240
Unique values in 'query_time': 27324


In [105]:
#As we can see that category column has only 1 value,
#we can drop it.


In [106]:
#Standardize the text in the dataset

In [107]:
train_df.dtypes

user          object
sku            int64
category      object
query         object
click_time    object
query_time    object
dtype: object

In [108]:
#query column

In [109]:
def standardize_text(df, column_name):
    df[column_name] = df[column_name].str.lower()  
    df[column_name] = df[column_name].str.replace('[^\w\s]', '', regex=True)

In [110]:
standardize_text(train_df, 'query')
standardize_text(test_df, 'query')

In [111]:
#convert click_time and query_time to datetime format

In [112]:
train_df['click_time'] = pd.to_datetime(train_df['click_time'])
train_df['query_time'] = pd.to_datetime(train_df['query_time'])

In [113]:
test_df['click_time'] = pd.to_datetime(test_df['click_time'])
test_df['query_time'] = pd.to_datetime(test_df['query_time'])

In [114]:
#Preprocessing

In [115]:
#Tokenization

In [116]:
# # Initialize the CountVectorizer
# vectorizer = CountVectorizer()

# # Fit and transform the 'query' column
# X_train_tokens = vectorizer.fit_transform(train_df['query'])
# X_test_tokens = vectorizer.transform(test_df['query'])

In [117]:
#Vectorization

In [118]:
# # Initialize the TfidfTransformer
# tfidf_transformer = TfidfTransformer()

# # Transform the tokenized data
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_tokens)
# X_test_tfidf = tfidf_transformer.transform(X_test_tokens)

In [121]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(train_df['query'])

In [122]:
#Feature Engineering

In [123]:
#Extracting hour of day from 'click_time'
train_df['click_time'] = pd.to_datetime(train_df['click_time'])
train_df['hour_of_day'] = train_df['click_time'].dt.hour

#Encoding categorical data (Example: 'user')
train_df['user'] = train_df['user'].astype('category')
train_df['user_cat'] = train_df['user'].cat.codes


In [124]:
#Handling Sparse Data

In [125]:
X_tfidf.shape

(42365, 2640)

In [126]:
svd = TruncatedSVD(n_components=100)  
X_tfidf_reduced = svd.fit_transform(X_tfidf)

In [127]:
#Data Splitting

In [128]:
y = train_df['sku']  
X_train, X_val, y_train, y_val = train_test_split(X_tfidf_reduced, y, test_size=0.2, random_state=42)

In [129]:
#Model Training

In [130]:
#Content based filtering

In [84]:
# Initialize a Random Forest Classifier
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Evaluate the model
print(classification_report(y_val, y_pred))   

                  precision    recall  f1-score   support

         1004622       0.42      0.83      0.56         6
         1010544       0.25      0.50      0.33         2
         1011491       1.00      0.50      0.67        24
         1011831       0.00      0.00      0.00         2
         1012721       0.87      0.91      0.89        22
         1013666       0.00      0.00      0.00         9
         1032361       0.69      0.81      0.74        36
         1052221       0.50      0.33      0.40         3
         1066233       0.00      0.00      0.00         4
         1066515       0.57      1.00      0.73         4
         1066551       0.42      0.79      0.55        14
         1067848       1.00      0.71      0.83         7
         1067948       0.50      1.00      0.67         1
         1078792       0.49      0.87      0.62        23
         1092494       0.92      0.92      0.92        36
         1094401       0.00      0.00      0.00         1
         1121

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [85]:
#Hyperparameter tuning

In [93]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Initialize the classifier
rf = RandomForestClassifier(random_state=42)

# Define the parameters grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize Grid Search with cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=1, verbose=2)

try:
    # Fit the grid search to the data
    grid_search.fit(X_train, y_train)

    # Get the best parameters
    best_params = grid_search.best_params_
    print("Best parameters:", best_params)

    # Train the model with the best parameters
    best_rf = RandomForestClassifier(**best_params, random_state=42)
    best_rf.fit(X_train, y_train)

    # Evaluate the model
    y_pred = best_rf.predict(X_val)
    print(classification_report(y_val, y_pred))

except Exception as e:
    print("An error occurred:", e)


Fitting 3 folds for each of 81 candidates, totalling 243 fits




[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.5s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.5s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.4s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   8.9s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   8.9s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   9.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  17.8s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  17.8s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  17.8s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=  

[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   7.4s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   7.5s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  14.9s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  14.8s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  14.8s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  29.5s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  29.6s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  29.6s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   7.3s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=  

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  15.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  31.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  30.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  31.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.1min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.0min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.0min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=  15.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=  15.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimato

                  precision    recall  f1-score   support

         1004622       0.42      0.83      0.56         6
         1010544       0.33      0.50      0.40         2
         1011491       1.00      0.50      0.67        24
         1011831       0.00      0.00      0.00         2
         1012721       0.95      0.91      0.93        22
         1013666       0.00      0.00      0.00         9
         1032361       0.71      0.81      0.75        36
         1052221       0.33      0.33      0.33         3
         1066233       0.00      0.00      0.00         4
         1066515       0.57      1.00      0.73         4
         1066551       0.40      0.86      0.55        14
         1067848       1.00      0.71      0.83         7
         1067948       0.50      1.00      0.67         1
         1078792       0.49      0.87      0.62        23
         1092494       0.94      0.92      0.93        36
         1094401       0.00      0.00      0.00         1
         1121

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [131]:
# from sklearn.ensemble import GradientBoostingClassifier

# # Initialize a Gradient Boosting Classifier
# gb_model = GradientBoostingClassifier(random_state=42)

# # Train the model
# gb_model.fit(X_train, y_train)

# # Predict on the validation set
# y_gb_pred = gb_model.predict(X_val)

# # Evaluate the model
# print("Gradient Boosting Classifier:")
# print(classification_report(y_val, y_gb_pred))


KeyboardInterrupt: 

In [133]:
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.metrics import classification_report
# from tqdm import tqdm
# import time

# # Initialize a Gradient Boosting Classifier
# gb_model = GradientBoostingClassifier(random_state=42)

# # Optional: Wrap your training data in tqdm for a progress bar
# for i in tqdm(range(1), desc="Training Model"):
#     gb_model.fit(X_train, y_train)
#     time.sleep(0.1)  # Just to show the progress, can be removed

# # Predict on the validation set
# y_gb_pred = gb_model.predict(X_val)

# # Evaluate the model
# print("Gradient Boosting Classifier:")
# print(classification_report(y_val, y_gb_pred))


Training Model:   0%|                                       | 0/1 [18:03<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# from sklearn.model_selection import GridSearchCV

# # Initialize the classifier
# gb = GradientBoostingClassifier(random_state=42)

# # Define the parameters grid
# gb_param_grid = {
#     'n_estimators': [50, 100, 200],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 5, 7],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# # Initialize Grid Search with cross-validation
# gb_grid_search = GridSearchCV(estimator=gb, param_grid=gb_param_grid, cv=3, n_jobs=1, verbose=2)

# try:
#     # Fit the grid search to the data
#     gb_grid_search.fit(X_train, y_train)

#     # Get the best parameters
#     best_gb_params = gb_grid_search.best_params_
#     print("Best parameters for Gradient Boosting:", best_gb_params)

#     # Train the model with the best parameters
#     best_gb = GradientBoostingClassifier(**best_gb_params, random_state=42)
#     best_gb.fit(X_train, y_train)

#     # Evaluate the model
#     y_gb_pred = best_gb.predict(X_val)
#     print("Gradient Boosting Classifier with best parameters:")
#     print(classification_report(y_val, y_gb_pred))

# except Exception as e:
#     print("An error occurred:", e)


In [134]:
#Matrix Factorization

In [139]:
#new matrix factorization validation


In [140]:
from sklearn.model_selection import train_test_split

# Split the dataset into training (80%) and testing (20%) sets
train_df_split, test_df_split = train_test_split(train_df, test_size=0.2, random_state=42)


In [141]:
# Training set interaction matrix
train_interaction_matrix = pd.pivot_table(train_df_split, index='user', columns='sku', aggfunc='size', fill_value=0)

# Testing set interaction matrix
test_interaction_matrix = pd.pivot_table(test_df_split, index='user', columns='sku', aggfunc='size', fill_value=0)


In [142]:
# Initialize SVD
svd = TruncatedSVD(n_components=20)

# Fit SVD on the training interaction matrix
train_matrix_reduced = svd.fit_transform(train_interaction_matrix)


In [144]:
# Identify common users and items in both training and testing sets
common_users = test_interaction_matrix.index.intersection(train_interaction_matrix.index)
common_items = test_interaction_matrix.columns.intersection(train_interaction_matrix.columns)


In [145]:
# Reconstruct the full interaction matrix using the SVD model
full_predicted_scores = svd.inverse_transform(train_matrix_reduced)
full_predicted_interaction_matrix = pd.DataFrame(full_predicted_scores, index=train_interaction_matrix.index, columns=train_interaction_matrix.columns)


In [146]:
# Extract predictions for common users and items
predicted_interaction_common = full_predicted_interaction_matrix.loc[common_users, common_items]


In [147]:
# Define a threshold for binary prediction
threshold = 0.5

# Apply the threshold for binary prediction
y_pred_common = (predicted_interaction_common > threshold).astype(int)

# Re-define the actual values from the test interaction matrix for common users and items
y_true_common = test_interaction_matrix.loc[common_users, common_items]


In [150]:
# Calculate RMSE
rmse_common = np.sqrt(mean_squared_error(y_true_common, y_pred_common))

print("RMSE: ", rmse_common)

RMSE:  0.07581133134981054


Interpreting RMSE:

A lower RMSE value generally indicates better model performance. However, the acceptability of the RMSE value depends on the context and the specific domain.

It is a better fitting model.

Since our original interaction data is binary (0 or 1), an RMSE of around 0.076 suggests that on average, the model's predictions are close to the actual values.

In [160]:
#Tuning the SVD Components
def evaluate_svd(n_components, train_matrix, test_matrix):
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    train_matrix_reduced = svd.fit_transform(train_matrix)

    # Reconstruct the interaction matrix for the training set
    full_predicted_scores = svd.inverse_transform(train_matrix_reduced)
    full_predicted_interaction_matrix = pd.DataFrame(full_predicted_scores, index=train_matrix.index, columns=train_matrix.columns)

    # Identify common users and items in both training and testing sets
    common_users = test_matrix.index.intersection(train_matrix.index)
    common_items = test_matrix.columns.intersection(train_matrix.columns)

    # Extract the relevant portion of the predicted matrix
    predicted_common = full_predicted_interaction_matrix.loc[common_users, common_items]
    actual_common = test_matrix.loc[common_users, common_items]

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(actual_common, predicted_common))
    return rmse

In [161]:
#Try different numbers of components
component_options = [10, 20, 50, 100, 200]
for n_components in component_options:
    rmse = evaluate_svd(n_components, train_interaction_matrix, test_interaction_matrix)
    print(f"RMSE with {n_components} components: {rmse}")


RMSE with 10 components: 0.0701678868250465
RMSE with 20 components: 0.07623755957832196
RMSE with 50 components: 0.08051888872666686
RMSE with 100 components: 0.08395777672452677
RMSE with 200 components: 0.08609994291267588


In [162]:
#Threshold Optimization
def evaluate_threshold(threshold, predicted_matrix, actual_matrix):
    # Identifying common users and items in both matrices
    common_users = actual_matrix.index.intersection(predicted_matrix.index)
    common_items = actual_matrix.columns.intersection(predicted_matrix.columns)

    # Extracting the relevant parts of the matrices
    y_pred_common = (predicted_matrix.loc[common_users, common_items] > threshold).astype(int)
    y_true_common = actual_matrix.loc[common_users, common_items]

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_true_common, y_pred_common))
    return rmse



In [163]:
# Test different thresholds
threshold_options = [0.3, 0.5, 0.7]
for threshold in threshold_options:
    rmse = evaluate_threshold(threshold, predicted_interaction_common, test_interaction_matrix)
    print(f"RMSE with threshold {threshold}: {rmse}")


RMSE with threshold 0.3: 0.07581133134981054
RMSE with threshold 0.5: 0.07581133134981054
RMSE with threshold 0.7: 0.07581133134981054


In [166]:
from sklearn.model_selection import KFold

def cross_validate_svd(data, n_splits=5, n_components=20):
    kf = KFold(n_splits=n_splits)
    rmse_scores = []

    for train_indices, test_indices in kf.split(data):
        # Splitting the data
        train_df_cv, test_df_cv = data.iloc[train_indices], data.iloc[test_indices]

        # Create interaction matrices
        train_matrix_cv = pd.pivot_table(train_df_cv, index='user', columns='sku', aggfunc='size', fill_value=0)
        test_matrix_cv = pd.pivot_table(test_df_cv, index='user', columns='sku', aggfunc='size', fill_value=0)

        # Check if there are common users and items in the fold
        common_users = test_matrix_cv.index.intersection(train_matrix_cv.index)
        common_items = test_matrix_cv.columns.intersection(train_matrix_cv.columns)
        
        if len(common_users) > 0 and len(common_items) > 0:
            # Evaluate SVD
            rmse = evaluate_svd(n_components, train_matrix_cv, test_matrix_cv)
            rmse_scores.append(rmse)

    if len(rmse_scores) > 0:
        return np.mean(rmse_scores)
    else:
        return None






In [167]:
# Perform cross-validation
avg_rmse = cross_validate_svd(train_df, n_splits=5, n_components=20)
if avg_rmse is not None:
    print(f"Average RMSE across folds: {avg_rmse}")
else:
    print("Insufficient common users/items in folds to perform evaluation.")

Insufficient common users/items in folds to perform evaluation.


In [171]:
avg_rmse = cross_validate_svd(train_df, n_splits=3, n_components=20)
if avg_rmse is not None:
    print(f"Average RMSE across folds: {avg_rmse}")
else:
    print("Insufficient common users/items in folds to perform evaluation.")

Insufficient common users/items in folds to perform evaluation.


In [169]:
avg_rmse = cross_validate_svd(train_df, n_splits=2, n_components=20)
if avg_rmse is not None:
    print(f"Average RMSE across folds: {avg_rmse}")
else:
    print("Insufficient common users/items in folds to perform evaluation.")

Average RMSE across folds: 0.08422726719656565


In [153]:
# Generate top-5 recommendations for each user
def top_n_recommendations(predicted_matrix, original_matrix, N=5):
    recommendations = {}

    for user in predicted_matrix.index:
        # Items interacted with by the user in the original matrix
        interacted_items = set(original_matrix.loc[user].loc[original_matrix.loc[user] > 0].index)

        # Predicted scores for items
        predicted_items = predicted_matrix.loc[user]

        # Filtering out items that were interacted with
        non_interacted_items = predicted_items[~predicted_items.index.isin(interacted_items)]

        # Top N items with the highest predicted score
        top_n_items = non_interacted_items.sort_values(ascending=False).head(N).index.tolist()

        recommendations[user] = top_n_items

    return recommendations



In [154]:
top_5_recommendations = top_n_recommendations(predicted_interaction_common, test_interaction_matrix, N=5)

In [155]:
top_5_recommendations

{'00033dbced6acd3626c4b56ff5c55b8d69911681': [2670133,
  9889193,
  2856517,
  2095189,
  2467183],
 '00835b02c8ed6e85348491471acc09119bb156d2': [2633103,
  2628429,
  2212043,
  1251132,
  2613542],
 '010b8d80387af7339c7cbdf93a28b7a7330851c4': [3046066,
  2633149,
  9713872,
  2897116,
  2467183],
 '010c8a8a77f4b316c61fba6f1f59dc918048d6fd': [3046066,
  1228939,
  9854804,
  2670133,
  2467183],
 '011ad82bba9796db701adf8e4f1ece86bb72e43b': [1228939,
  2856544,
  2856517,
  2758085,
  1251132],
 '015ba666ca9feda7a063299327b113fc51e30174': [3046066,
  1228939,
  9713872,
  2897116,
  2807036],
 '015ff875976f0c3006937204c8500e0ed4100779': [2953607,
  3001046,
  9460736,
  2613542,
  9854668],
 '016d52f919ceaa148fa5e6eec821006e8dcd89ac': [2467183,
  2032076,
  2613542,
  9984142,
  3244621],
 '01b4a53b5dbd97633b37cbc14a3d7c74dd2d516c': [2633103,
  2628429,
  2212043,
  1251132,
  2613542],
 '01bd8e0aa569fb4fb1df82c030a1bf6720d48b70': [3046066,
  2095189,
  1228939,
  2633103,
  9902347],


In [172]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD

# Load the training and test datasets
train_df = pd.read_csv("acm-sf-chapter-hackathon-small/train.csv")
test_df = pd.read_csv("acm-sf-chapter-hackathon-small/test.csv")

# Create interaction matrices for the training and test datasets
train_interaction_matrix = pd.pivot_table(train_df, index='user', columns='sku', aggfunc='size', fill_value=0)
test_interaction_matrix = pd.pivot_table(test_df, index='user', columns='sku', aggfunc='size', fill_value=0)

# Train the SVD model on the training interaction matrix
n_components = 20  # This can be tuned
svd = TruncatedSVD(n_components=n_components, random_state=42)
train_matrix_reduced = svd.fit_transform(train_interaction_matrix)

# Predict interactions for the test dataset
# Handling the fact that the test set might contain users or items not present in the training set
common_users = test_interaction_matrix.index.intersection(train_interaction_matrix.index)
common_items = test_interaction_matrix.columns.intersection(train_interaction_matrix.columns)
predicted_scores_test = svd.inverse_transform(train_matrix_reduced)[:, common_items.tolist()]
predicted_interaction_matrix_test = pd.DataFrame(predicted_scores_test, index=common_users, columns=common_items)

# Evaluate the model on the test dataset using RMSE
threshold = 0.5  # This threshold can be tuned
y_pred_test = (predicted_interaction_matrix_test > threshold).astype(int)
y_true_test = test_interaction_matrix.loc[common_users, common_items]
rmse_test = np.sqrt(mean_squared_error(y_true_test, y_pred_test))

print("RMSE on Test Dataset: ", rmse_test)


KeyError: 'sku'

In [173]:
print(test_df.head())

                                       user      category  \
0  00025eb02b249434554fe2cacd8562db325df127  abcat0701002   
1  00033dbced6acd3626c4b56ff5c55b8d69911681  abcat0701002   
2  000548d17532b70071b7d59edd4797aed1823c60  abcat0701002   
3  0006f15231a422156a9d005735d0969a5e5a0ac4  abcat0701002   
4  000a16ce5371b0fb3ad0c7f6183a5476b434a95b  abcat0701002   

                          query               click_time  \
0                    child eden  2011-09-02 13:05:24.146   
1                   Revelations   2011-09-25 13:38:14.74   
2                  Gears of war  2011-09-16 17:34:43.596   
3                        batman  2011-10-09 11:14:48.352   
4  Assassins creed: revelations  2011-09-05 00:31:53.083   

                query_time  
0  2011-09-02 13:04:34.633  
1  2011-09-25 13:37:58.382  
2  2011-09-16 17:33:43.237  
3  2011-10-09 11:14:33.692  
4  2011-09-05 00:31:39.761  


In [175]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD

# Function to standardize text in the 'query' column
def standardize_text(df, column_name):
    df[column_name] = df[column_name].str.lower()
    df[column_name] = df[column_name].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Load the training and test datasets
train_df = pd.read_csv("acm-sf-chapter-hackathon-small/train.csv")
test_df = pd.read_csv("acm-sf-chapter-hackathon-small/test.csv")
    
# Standardize the 'query' column in both datasets
standardize_text(train_df, 'query')
standardize_text(test_df, 'query')

# Create interaction matrices for the training and test datasets using 'query' as a proxy for items
train_interaction_matrix = pd.pivot_table(train_df, index='user', columns='query', aggfunc='size', fill_value=0)
test_interaction_matrix = pd.pivot_table(test_df, index='user', columns='query', aggfunc='size', fill_value=0)

# Train the SVD model on the training interaction matrix
n_components = 20  # This can be tuned
svd = TruncatedSVD(n_components=n_components, random_state=42)
train_matrix_reduced = svd.fit_transform(train_interaction_matrix)

# Predict interactions for the test dataset
common_users = test_interaction_matrix.index.intersection(train_interaction_matrix.index)
common_queries = test_interaction_matrix.columns.intersection(train_interaction_matrix.columns)
predicted_scores_test = svd.inverse_transform(train_matrix_reduced)[:, common_queries.tolist()]
predicted_interaction_matrix_test = pd.DataFrame(predicted_scores_test, index=common_users, columns=common_queries)

# Evaluate the model on the test dataset using RMSE
threshold = 0.5  # This threshold can be tuned
y_pred_test = (predicted_interaction_matrix_test > threshold).astype(int)
y_true_test = test_interaction_matrix.loc[common_users, common_queries]
rmse_test = np.sqrt(mean_squared_error(y_true_test, y_pred_test))

print("RMSE on Test Dataset: ", rmse_test)


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [176]:
##testing our random forest classifier

In [177]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Function to standardize text
def standardize_text(df, column_name):
    df[column_name] = df[column_name].str.lower()
    df[column_name] = df[column_name].str.replace('[^\w\s]', '', regex=True)

# Load the datasets
train_df = pd.read_csv('acm-sf-chapter-hackathon-small/train.csv')
test_df = pd.read_csv('acm-sf-chapter-hackathon-small/test.csv')

# Standardize the 'query' column in both datasets
standardize_text(train_df, 'query')
standardize_text(test_df, 'query')

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(train_df['query'])

# Dimensionality Reduction
svd = TruncatedSVD(n_components=100)
X_tfidf_reduced = svd.fit_transform(X_tfidf)

# Data Splitting
y = train_df['sku']
X_train, X_val, y_train, y_val = train_test_split(X_tfidf_reduced, y, test_size=0.2, random_state=42)

# Model Training
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Validate the model
y_val_pred = model.predict(X_val)
print(classification_report(y_val, y_val_pred))

# Transform the 'query' column in the test dataset
X_test_tfidf = tfidf_vectorizer.transform(test_df['query'])

# Reduce dimensions for the test data
X_test_tfidf_reduced = svd.transform(X_test_tfidf)

# Predict using the trained model
y_test_pred = model.predict(X_test_tfidf_reduced)

# Analyzing the predicted SKUs
predicted_sku_counts = pd.Series(y_test_pred).value_counts()
print(predicted_sku_counts.head())  # Display the top 5 most frequently predicted SKUs


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                  precision    recall  f1-score   support

         1004622       0.42      0.83      0.56         6
         1010544       0.50      0.50      0.50         2
         1011491       1.00      0.50      0.67        24
         1011831       0.00      0.00      0.00         2
         1012721       0.87      0.91      0.89        22
         1012876       0.00      0.00      0.00         0
         1013666       0.00      0.00      0.00         9
         1032361       0.69      0.81      0.74        36
         1052221       0.50      0.33      0.40         3
         1066233       0.00      0.00      0.00         4
         1066515       0.50      0.75      0.60         4
         1066551       0.44      0.79      0.56        14
         1067848       1.00      0.71      0.83         7
         1067948       0.50      1.00      0.67         1
         1078792       0.48      0.87      0.62        23
         1092494       0.94      0.92      0.93        36
         1094

9854804    3284
2107458    2286
2173065    2062
2541184    1608
2945052    1563
dtype: int64


In [178]:
import xml.etree.ElementTree as ET
import pandas as pd

# Load and parse the XML file
tree = ET.parse('acm-sf-chapter-hackathon-small/small_product_data.xml')
root = tree.getroot()



In [187]:
import xml.etree.ElementTree as ET
import pandas as pd

# Function to safely extract text from an XML element
def get_text(element, tag):
    found_element = element.find(tag)
    return found_element.text if found_element is not None else None

# Extract product information
products = []
for product in root.findall('product'):
    product_data = {
        'sku': get_text(product, 'sku'),
        'name': get_text(product, 'name'),
        'categoryPath': get_text(product, 'categoryPath'),
        'shortDescription': get_text(product, 'shortDescription'),
        'customerReviewAverage': get_text(product, 'customerReviewAverage'),
        'manufacturer': get_text(product, 'manufacturer')
    }
    products.append(product_data)

# Convert to DataFrame
product_df = pd.DataFrame(products)


In [188]:
product_df

Unnamed: 0,sku,name,categoryPath,shortDescription,customerReviewAverage,manufacturer
0,1004622,Sniper: Ghost Warrior - Xbox 360,\n,Control the power of death as you ghost throug...,3.4,City Interactive
1,1010544,Monopoly Streets - Xbox 360,\n,Who says monopolies are illegal?,4.0,Electronic Arts
2,1011067,MySims: SkyHeroes - Xbox 360,\n,Take to the skies for high-flying action,2.0,Electronic Arts
3,1011491,FIFA Soccer 11 - Xbox 360,\n,Soccer action just got a lot more personal,4.6,EA SPORTS
4,1011831,Hasbro Family Game Night 3 - Xbox 360,\n,Take the fun to the amusement park,3.5,Electronic Arts
...,...,...,...,...,...,...
869,9977237,Shaun White Skateboarding - Xbox 360,\n,Grab your deck as you leave the snowy mountain...,3.0,Ubisoft Entertainment
870,9980886,Star Wars: The Force Unleashed - Xbox 360,\n,Step into the shoes of Darth Vader's secret ap...,4.5,LucasArts
871,9984142,Assassin's Creed Brotherhood - Xbox 360,\n,Join the Brotherhood,4.8,Ubisoft Entertainment
872,9999169100050027,Digital Only Bundle,\n,,,


In [190]:
product_df.isnull().sum()

sku                        0
name                       0
categoryPath               0
shortDescription           4
customerReviewAverage    156
manufacturer               2
dtype: int64

In [192]:
for column in product_df.columns:
    unique_values = product_df[column].nunique()
    print(f"Unique values in '{column}': {unique_values}")

Unique values in 'sku': 437
Unique values in 'name': 434
Unique values in 'categoryPath': 1
Unique values in 'shortDescription': 418
Unique values in 'customerReviewAverage': 37
Unique values in 'manufacturer': 56


In [193]:
product_df.shape

(874, 6)

In [194]:
product_df.to_csv('product_df.csv', index=False)

In [195]:
#preprocessing

In [196]:
product_df['shortDescription'].fillna('Unknown', inplace=True)
product_df['manufacturer'].fillna('Unknown', inplace=True)
product_df.drop('categoryPath', axis=1, inplace=True)

In [197]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Example of creating TF-IDF features from 'shortDescription'
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_features = tfidf_vectorizer.fit_transform(product_df['shortDescription'])

# Convert to DataFrame (optional, for merging with other data)
tfidf_df = pd.DataFrame(tfidf_features.toarray(), index=product_df.index)


In [198]:
# Merge the interaction data with the product data
interaction_df = pd.read_csv('acm-sf-chapter-hackathon-small/train.csv')  # Replace with the correct path
merged_df = interaction_df.merge(product_df, on='sku', how='left')

# Merge TF-IDF features if needed
merged_df = merged_df.join(tfidf_df, on=product_df.index)


ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat