In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import datetime, logging, sys

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# ---- Run configuration -------------------------------------------------
# Similarities whose absolute value is not above this cutoff are zeroed.
correlation_cutoff = 0.0
# When set to 1 the similarities are rescaled as 0.5 + s/2.
correlation_min_max = 0
# When set to 0 the scoring step skips mean-centring of the ratings.
correction_factor = 1
# NOTE(review): absolute, machine-specific path -- consider making this configurable.
input_folder = '/data/arpit.goel/23_MLWareRecommendation'

logging.info("Starting Script %s", ' '.join(sys.argv))
logging.info("Reading Files")
train_file = pd.read_csv(input_folder + '/train_MLWARE2.csv')
test_file = pd.read_csv(input_folder + '/test_MLWARE2.csv')

# Optional subsample (userId < 100) for a quick run; disabled by default.
# train_file=train_file[train_file['userId']<100]
# test_file=test_file[test_file['userId']<100]

# Baseline statistics: mean rating per user, per item, and overall.
user_rating_mean = train_file.groupby('userId')['rating'].mean()
item_rating_mean = train_file.groupby('itemId')['rating'].mean()
avg_rating_mean = train_file['rating'].mean()

# Dense user x item rating table (NaN where there is no rating); when a
# (user, item) pair occurs more than once the maximum rating is kept.
u_i_matrix = (
    train_file
    .groupby(['userId', 'itemId'])['rating']
    .max()
    .unstack()
)
i_u_matrix = u_i_matrix.T

# Mean-centred copies (row mean subtracted, missing ratings filled with 0),
# stored sparse as input for the similarity computation.
u_i_matrix_norm = csr_matrix(u_i_matrix.sub(user_rating_mean, axis=0).fillna(0))
i_u_matrix_norm = csr_matrix(i_u_matrix.sub(item_rating_mean, axis=0).fillna(0))




2017-02-25 04:53:15,261 : INFO : Starting Script /data/arpit.goel/anaconda/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py -f /home/arpit.goel/.local/share/jupyter/runtime/kernel-c8ea1643-2a85-4e5d-b55a-06499474b033.json
2017-02-25 04:53:15,263 : INFO : Reading Files


In [2]:
logging.info("Making Similarity Matrices")

# Pairwise cosine similarity between the rows of each mean-centred matrix.
user_correlation = cosine_similarity(u_i_matrix_norm)
item_correlation = cosine_similarity(i_u_matrix_norm)

# Keep only similarities whose magnitude exceeds the cutoff; zero the rest.
user_keep = np.abs(user_correlation) > correlation_cutoff
item_keep = np.abs(item_correlation) > correlation_cutoff
user_correlation = user_correlation * user_keep.astype(float)
item_correlation = item_correlation * item_keep.astype(float)

# Optional rescale: s -> 0.5 + s/2.
if correlation_min_max == 1:
    user_correlation = user_correlation / 2.0 + 0.5
    item_correlation = item_correlation / 2.0 + 0.5

# Label -> positional row number in the similarity matrices.
user_index = {label: pos for pos, label in enumerate(u_i_matrix.index)}
item_index = {label: pos for pos, label in enumerate(i_u_matrix.index)}


2017-02-25 04:54:20,892 : INFO : Making Similarity Matrices


In [None]:
# Number of items rated by both users, for every pair of users.
# The mask is cast to integers first: the dot product of two boolean arrays
# is itself boolean ("any item in common"), not a count.
rated_mask = u_i_matrix.notnull().values.astype(np.int64)
u_u_count = np.dot(rated_mask, rated_mask.T)

# The original cell ended in an unfinished comparison (`u_u_count>`), which is
# a syntax error. TODO: choose the minimum-overlap threshold that was intended.
# Until then, preview a small corner instead of dumping the full matrix.
u_u_count[:5, :5]

In [None]:
def _neighbourhood_scores(ratings, neighbour_means, weights, target_mean, ids):
    """Similarity-weighted rating prediction for one group of test rows.

    Parameters
    ----------
    ratings : DataFrame
        One row per test interaction, one column per training neighbour;
        NaN where that neighbour has no rating.
    neighbour_means : Series or None
        Mean rating per neighbour, indexed like the columns of ``ratings``.
        ``None`` disables mean-centring.
    weights : 1-D ndarray
        Similarities, positionally aligned with the columns of ``ratings``.
    target_mean : float
        Value added back to the weighted deviation (0 when not centring).
    ids : array-like
        Labels for the returned Series.

    Returns
    -------
    Series of predictions indexed by ``ids``; NaN where no neighbour with a
    non-zero similarity has a rating.
    """
    if neighbour_means is not None:
        ratings = ratings - neighbour_means  # aligns on the column labels
    weighted_sum = (ratings * weights).sum(axis=1)
    # Normalise by the sum of |similarity|: summing signed similarities lets
    # positive and negative neighbours cancel, which can drive the
    # denominator to ~0 or below and blow up / sign-flip the prediction.
    weight_total = (ratings.notnull() * np.abs(weights)).sum(axis=1)
    scores = weighted_sum / weight_total + target_mean
    scores.index = ids
    return scores


# Similarity matrices and label lookups are rebuilt here (same steps as the
# earlier similarity cell) so this cell does not rely on that cell's state.
logging.info("Making Similarity Matrices")
user_correlation = cosine_similarity(u_i_matrix_norm)
item_correlation = cosine_similarity(i_u_matrix_norm)

user_correlation = user_correlation * (np.abs(user_correlation) > correlation_cutoff).astype(float)
item_correlation = item_correlation * (np.abs(item_correlation) > correlation_cutoff).astype(float)

if correlation_min_max == 1:
    user_correlation = 0.5 + user_correlation / 2.0
    item_correlation = 0.5 + item_correlation / 2.0

user_index = dict(zip(u_i_matrix.index, range(u_i_matrix.index.shape[0])))
item_index = dict(zip(i_u_matrix.index, range(i_u_matrix.index.shape[0])))

use_means = correction_factor != 0
u_rating = []
i_rating = []

# NOTE: user_index[...] / item_index[...] raise KeyError for a test user or
# item that never appears in the training data.
logging.info("Scoring User Collaborative Filtering")
for n_done, (user_id, group) in enumerate(test_file.groupby('userId'), 1):
    if n_done % 1000 == 0:
        logging.info("Users Scored:%d" % n_done)
    u_rating.append(_neighbourhood_scores(
        # .reindex (not the removed .ix): items unseen in training give NaN rows
        i_u_matrix.reindex(group['itemId'].values),
        user_rating_mean if use_means else None,
        user_correlation[user_index[user_id]],
        user_rating_mean[user_id] if use_means else 0,
        group['ID'].values,
    ))

for n_done, (item_id, group) in enumerate(test_file.groupby('itemId'), 1):
    if n_done % 10 == 0:
        logging.info("Items Scored:%d" % n_done)
    i_rating.append(_neighbourhood_scores(
        u_i_matrix.reindex(group['userId'].values),
        item_rating_mean if use_means else None,
        item_correlation[item_index[item_id]],
        item_rating_mean[item_id] if use_means else 0,
        group['ID'].values,
    ))

# Attach both predictions to the test rows by ID, clipped to [0, 10].
recommendations = test_file.copy()
recommendations['u_rating'] = recommendations['ID'].map(pd.concat(u_rating)).clip(0, 10)
recommendations['i_rating'] = recommendations['ID'].map(pd.concat(i_rating)).clip(0, 10)

# Observed rating range of each user and each item in the training data.
user_rating_range = train_file.groupby('userId')['rating'].agg(['min', 'max'])
item_rating_range = train_file.groupby('itemId')['rating'].agg(['min', 'max'])
recommendations['user_min'] = recommendations['userId'].map(user_rating_range['min'])
recommendations['user_max'] = recommendations['userId'].map(user_rating_range['max'])
recommendations['item_min'] = recommendations['itemId'].map(item_rating_range['min'])
recommendations['item_max'] = recommendations['itemId'].map(item_rating_range['max'])

# Raise each prediction to at least the larger of the two minima, then cap it
# at the smaller of the two maxima. Row-wise max/min skip NaN, so a missing
# prediction falls back to these bounds.
for score_col in ('u_rating', 'i_rating'):
    recommendations[score_col] = recommendations[[score_col, 'user_min', 'item_min']].max(axis=1)
    recommendations[score_col] = recommendations[[score_col, 'user_max', 'item_max']].min(axis=1)

# Output files are tagged with the run configuration. The original used
# sys.argv[1..3], which under a notebook kernel are the kernel's own launch
# arguments (sys.argv[3] does not exist there).
# NOTE(review): assumes the three CLI arguments were these three settings -- confirm.
run_tag = '%s_%s_%s' % (correlation_cutoff, correlation_min_max, correction_factor)

# .assign builds a new frame instead of writing into a slice of `recommendations`.
user_interaction = recommendations[['ID', 'userId', 'itemId', 'u_rating']].assign(
    rating=recommendations['u_rating'])
user_interaction[['ID', 'userId', 'itemId', 'rating']].to_csv(
    input_folder + '/test_scores/collaborative_filtering_user_%s.csv' % run_tag, index=False)

item_interaction = recommendations[['ID', 'userId', 'itemId', 'i_rating']].assign(
    rating=recommendations['i_rating'])
item_interaction[['ID', 'userId', 'itemId', 'rating']].to_csv(
    input_folder + '/test_scores/collaborative_filtering_item_%s.csv' % run_tag, index=False)
