In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Step 2: Read the ratings CSV into a dataframe
df = pd.read_csv('test_paper.csv')

# Preview the first rows as loaded, before any missing values are filled
print(df.head())

# Replace every missing value with 0 (this applies to all columns, ids included)
df = df.fillna(0)

# Step 3: Build the user-item matrix: one row per user_id, one column per
# paper_id. 'mean' is pivot_table's default aggregation, spelled out here so
# the handling of repeated (user_id, paper_id) pairs is visible; pairs with
# no rating are filled with 0.
user_item_matrix = pd.pivot_table(
    df,
    values='rating',
    index='user_id',
    columns='paper_id',
    aggfunc='mean',
    fill_value=0,
)

# Show the resulting matrix
print(user_item_matrix)

   user_id  paper_id  rating
0     8103      9979       1
1     2617      7966       2
2     3993      7792       2
3     7785      5646       3
4     4437      6430       4
paper_id  5020  5041  5086  5091  5098  5178  5222  5240  5245  5283  ...  \
user_id                                                               ...   
1051       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
1113       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
1215       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
1232       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
1294       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
...        ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
9776       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
9782       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
9809       0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0

In [2]:
# Step 4: Pairwise cosine similarity between the rows of the user-item matrix.
# The rows are indexed by user_id, so entry (i, j) compares user i with user j.
cosine_sim = cosine_similarity(user_item_matrix)

# Label both axes with the matrix's row index so scores can be looked up by id
user_ids = user_item_matrix.index
cosine_sim_df = pd.DataFrame(cosine_sim, index=user_ids, columns=user_ids)

# Show the labelled similarity matrix
print(cosine_sim_df)

user_id  1051  1113  1215  1232  1294  1363  1365  1410  1464  1518  ...  \
user_id                                                              ...   
1051      1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
1113      0.0   1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
1215      0.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
1232      0.0   0.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
1294      0.0   0.0   0.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
9776      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
9782      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
9809      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
9909      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
9956      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   

user_id  93

In [3]:
# Step 5: Define the recommendation function
def recommend_papers(paper_id, cosine_sim_df, top_n=3):
    """Return the ids most similar to ``paper_id`` according to ``cosine_sim_df``.

    Parameters
    ----------
    paper_id : hashable
        Label to look up; it must be present in ``cosine_sim_df.index``.
    cosine_sim_df : pandas.DataFrame
        Similarity matrix whose row labels (and column labels) are the ids
        being compared.
    top_n : int, default 3
        Maximum number of ids to return.

    Returns
    -------
    list
        Up to ``top_n`` column labels of ``cosine_sim_df``, ordered from most
        to least similar. The queried id itself is never included.

    Raises
    ------
    ValueError
        If ``paper_id`` is not a row label of ``cosine_sim_df``.

    Notes
    -----
    NOTE(review): the returned labels are whatever ``cosine_sim_df`` is indexed
    by. If the matrix was built from a user-by-paper table (rows = users), the
    results are user ids rather than paper ids -- confirm against the caller.
    """
    if paper_id not in cosine_sim_df.index:
        raise ValueError(f"Paper ID {paper_id} not found in the dataset.")

    # Similarity of the queried id to every other id. Remove the queried id by
    # label instead of assuming it sorts first: when another id ties with the
    # self-similarity, positional skipping can drop a real neighbour and
    # return the queried id itself.
    sim_scores = cosine_sim_df.loc[paper_id].drop(labels=paper_id, errors="ignore")

    # Highest similarity first; a stable sort keeps tie ordering deterministic.
    sim_scores = sim_scores.sort_values(ascending=False, kind="mergesort")

    return sim_scores.index[:top_n].tolist()


In [5]:
# Look up the 3 nearest neighbours of id 1051 in the similarity matrix.
# NOTE(review): 1051 appears as a user_id label in the printed similarity
# matrix, while the message below calls it a paper_id -- confirm which is meant.
recommended = recommend_papers(1051, cosine_sim_df, top_n=3)
print(f"Recommended papers for paper_id 1051: {recommended}")

Recommended papers for paper_id 1051: [7116, 6794, 6849]
