In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from scipy.sparse import coo_matrix

In [None]:
# Read the tab-separated Walmart product-review sample into a DataFrame
# and display its (rows, columns) shape.
rawDataPath = 'data/marketing_sample_for_walmart_com-walmart_com_product_review__20200701_20201231__5k_data.tsv'
trainData = pd.read_csv(rawDataPath, delimiter='\t')
trainData.shape

In [None]:
# Display the column labels of trainData.
trainData.columns

In [None]:
# Keep only the columns used by the rest of the notebook.
# The explicit .copy() makes the subset an independent frame, so the column
# assignments in later cells do not operate on a slice of the loaded data
# (which is what triggers pandas' SettingWithCopyWarning).
selectedColumns = [
    'Uniq Id',
    'Product Id',
    'Product Rating',
    'Product Reviews Count',
    'Product Category',
    'Product Brand',
    'Product Name',
    'Product Image Url',
    'Product Description',
    'Product Tags',
]
trainData = trainData[selectedColumns].copy()
trainData.head()

In [None]:
# Display the (rows, columns) shape of trainData.
trainData.shape

In [None]:
# Count missing values in each column.
trainData.isnull().sum()

In [None]:
# Replace missing values column by column: numeric fields get 0, text fields
# get an empty string. Columns not listed here are left untouched.
# A single DataFrame.fillna call returns a new frame, so nothing is assigned
# into a possibly-sliced frame (avoids SettingWithCopyWarning).
missingValueDefaults = {
    'Product Rating': 0,
    'Product Reviews Count': 0,
    'Product Category': '',
    'Product Brand': '',
    'Product Description': '',
}
trainData = trainData.fillna(value=missingValueDefaults)

In [None]:
# Count missing values in each column again, after the fill step.
trainData.isnull().sum()

In [None]:
# Print how many rows are exact duplicates of an earlier row.
print(trainData.duplicated().sum())

In [None]:
# Display the current column labels (before they are renamed).
trainData.columns

In [None]:
# Map the verbose source column labels to the short names used from here on.
columnNameMapping = {
    'Uniq Id': 'ID',
    'Product Id': 'prodID',
    'Product Rating': 'Rating',
    'Product Reviews Count': 'reviewCount',
    'Product Category': 'Category',
    'Product Brand': 'Brand',
    'Product Name': 'Name',
    'Product Image Url': 'imageURL',
    'Product Description': 'Description',
    'Product Tags': 'Tags',
}

# Rebind to the renamed frame instead of using inplace=True, so the frame
# shown by earlier cells is not mutated behind the reader's back.
trainData = trainData.rename(columns=columnNameMapping)

In [None]:
# Display the column labels after renaming.
trainData.columns

In [None]:
# Preview the first three rows.
trainData.head(3)

In [None]:
# Turn the string identifiers into floats by keeping only the first run of
# digits found in each value; values containing no digit become NaN.
# NOTE(review): this is lossy -- distinct identifiers that share the same
# leading digit run collapse to one number. Confirm this is intended.
for idColumn in ('ID', 'prodID'):
    firstDigitRun = trainData[idColumn].str.extract(r'(\d+)', expand=False)
    trainData[idColumn] = firstDigitRun.astype(float)

In [None]:
# Distinct-value counts for the ID, prodID and Rating columns.
# numRatings is the number of distinct rating values, not the number of rows.
numUsers, numItems, numRatings = (
    trainData[column].nunique() for column in ('ID', 'prodID', 'Rating')
)

print(f"Number of unique users: {numUsers}")
print(f"Number of unique items: {numItems}")
print(f"Number of unique ratings: {numRatings}")

In [None]:
# One row per distinct Rating value, one column ('ID') holding the mean of
# the numeric ID over the rows with that rating.
heatmapData = trainData.pivot_table(values='ID', index='Rating', aggfunc='mean')

# Rating runs down the y-axis (the pivot index) and the single ID column sits
# on the x-axis, so the axis labels are set to match that layout.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(heatmapData, annot=True, fmt='g', cmap='coolwarm', cbar=True, ax=ax)
ax.set_title('Heatmap of user Ratings')
ax.set_xlabel('User ID')
ax.set_ylabel('Ratings')
plt.show()

In [None]:
# Two side-by-side histograms: how many rows each ID value has (left) and
# how many rows each prodID value has (right).
fig, (userAx, itemAx) = plt.subplots(1, 2, figsize=(12, 5))

rowsPerUser = trainData['ID'].value_counts()
rowsPerUser.hist(ax=userAx, bins=10, edgecolor='k')
userAx.set_xlabel('Interactions per User')
userAx.set_ylabel('Number of Users')
userAx.set_title('Distribution of Interactions per User')

rowsPerItem = trainData['prodID'].value_counts()
rowsPerItem.hist(ax=itemAx, bins=10, edgecolor='k', color='green')
itemAx.set_xlabel('Interactions per Item')
itemAx.set_ylabel('Number of Items')
itemAx.set_title('Distribution of Interactions per Item')

fig.tight_layout()
plt.show()

In [None]:
# The five prodID values that occur in the most rows, drawn as a bar chart.
itemFrequencies = trainData['prodID'].value_counts()
popularItems = itemFrequencies.head(5)
popularAx = popularItems.plot.bar(color='red')
popularAx.set_title("Most Popular Items")

In [None]:
# Bar chart of how many rows carry each Rating value (most frequent first).
trainData['Rating'].value_counts().plot.bar(color='red')