# Recommender system using TF-IDF

## Necessary imports

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

import ast 
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

import warnings; warnings.simplefilter('ignore')

## Exploratory data analysis


### Loading dataset

In [None]:
# Location of the survey CSV. The absolute /content path presumably comes from
# a hosted-notebook session -- adjust it when running elsewhere.
path = '/content/rankings.csv'

In [None]:
# Load the survey responses (one row per respondent) from the CSV at `path`.
rankings = pd.read_csv(path, encoding='utf-8')

In [None]:
rankings.head()

Unnamed: 0,OS,Budget,Size,Camera Importance,Usage Hours (Battery),Speed Importance,top,second,third,confidence,difficulty
0,Must be Android,400.0,Large,A little,A little,A lot,Motorola Moto G100,Motorola Edge (2021),Samsung Galaxy A42,OK,Medium
1,Must be Android,400.0,Large,A ton,A little,I need speed,Google Pixel 6,OnePlus 9,Samsung Galaxy S20 FE,OK,Hard
2,Must be Android,400.0,Medium,Not sure,Not sure,I need speed,OnePlus 8T,Motorola Moto G100,OnePlus 9,OK,Medium
3,Must be Android,400.0,No preference,Somewhat,A little,A little,Google Pixel 5a,Samsung Galaxy A53,Samsung Galaxy A42,Good,Medium
4,Must be Android,400.0,No preference,A ton,A lot,A lot,Motorola Moto G100,Samsung Galaxy A42,Samsung Galaxy A53,Good,Medium


In [None]:
rankings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   OS                     177 non-null    object
 1   Budget                 177 non-null    object
 2   Size                   177 non-null    object
 3   Camera Importance      177 non-null    object
 4   Usage Hours (Battery)  177 non-null    object
 5   Speed Importance       177 non-null    object
 6   top                    177 non-null    object
 7   second                 177 non-null    object
 8   third                  177 non-null    object
 9   confidence             177 non-null    object
 10  difficulty             177 non-null    object
dtypes: object(11)
memory usage: 15.3+ KB


### Dropping last two columns

In [None]:
# 'confidence' and 'difficulty' are not used by the recommender, so remove
# them before any further processing.
unused_columns = ['confidence', 'difficulty']
rankings = rankings.drop(columns=unused_columns)

In [None]:
rankings.head()

Unnamed: 0,OS,Budget,Size,Camera Importance,Usage Hours (Battery),Speed Importance,top,second,third
0,Must be Android,400.0,Large,A little,A little,A lot,Motorola Moto G100,Motorola Edge (2021),Samsung Galaxy A42
1,Must be Android,400.0,Large,A ton,A little,I need speed,Google Pixel 6,OnePlus 9,Samsung Galaxy S20 FE
2,Must be Android,400.0,Medium,Not sure,Not sure,I need speed,OnePlus 8T,Motorola Moto G100,OnePlus 9
3,Must be Android,400.0,No preference,Somewhat,A little,A little,Google Pixel 5a,Samsung Galaxy A53,Samsung Galaxy A42
4,Must be Android,400.0,No preference,A ton,A lot,A lot,Motorola Moto G100,Samsung Galaxy A42,Samsung Galaxy A53


### Checking dataset grouping by columns

#### Budget

In [None]:
# Print how many respondents fall into each Budget answer.
# The grouping is built once instead of being recomputed on every iteration,
# and the column is passed as a scalar key so each group label stays a plain
# value (one-element list groupers may yield 1-tuple keys on newer pandas --
# verify against the installed version).
budget_groups = rankings.groupby('Budget').groups
for key, row_labels in budget_groups.items():
    print("Total {} in {} group".format(len(row_labels), key))

Total 38 in 400.00 group
Total 24 in 600.00 group
Total 41 in 800.00 group
Total 74 in Not sure group


#### Size

In [None]:
# Print how many respondents fall into each Size answer.
# The grouping is built once instead of being recomputed on every iteration,
# and the column is passed as a scalar key so each group label stays a plain
# value (one-element list groupers may yield 1-tuple keys on newer pandas --
# verify against the installed version).
size_groups = rankings.groupby('Size').groups
for key, row_labels in size_groups.items():
    print("Total {} in {} group".format(len(row_labels), key))

Total 30 in Large group
Total 29 in Medium group
Total 66 in No preference group
Total 52 in Small group


#### Camera Importance

In [None]:
# Print how many respondents fall into each Camera Importance answer.
# The grouping is built once instead of being recomputed on every iteration,
# and the column is passed as a scalar key so each group label stays a plain
# value (one-element list groupers may yield 1-tuple keys on newer pandas --
# verify against the installed version).
camera_groups = rankings.groupby('Camera Importance').groups
for key, row_labels in camera_groups.items():
    print("Total {} in {} group".format(len(row_labels), key))

Total 38 in A little group
Total 38 in A lot group
Total 38 in A ton group
Total 29 in Not sure group
Total 34 in Somewhat group


#### Usage Hours (Battery)

In [None]:
# Print how many respondents fall into each Usage Hours (Battery) answer.
# The grouping is built once instead of being recomputed on every iteration,
# and the column is passed as a scalar key so each group label stays a plain
# value (one-element list groupers may yield 1-tuple keys on newer pandas --
# verify against the installed version).
battery_groups = rankings.groupby('Usage Hours (Battery)').groups
for key, row_labels in battery_groups.items():
    print("Total {} in {} group".format(len(row_labels), key))

Total 36 in A little group
Total 32 in A lot group
Total 30 in A ton group
Total 79 in Not sure group


#### Speed Importance

In [None]:
# Print how many respondents fall into each Speed Importance answer.
# The grouping is built once instead of being recomputed on every iteration,
# and the column is passed as a scalar key so each group label stays a plain
# value (one-element list groupers may yield 1-tuple keys on newer pandas --
# verify against the installed version).
speed_groups = rankings.groupby('Speed Importance').groups
for key, row_labels in speed_groups.items():
    print("Total {} in {} group".format(len(row_labels), key))

Total 68 in A little group
Total 33 in A lot group
Total 37 in I need speed group
Total 39 in Not sure group


## Building the Recommendation System

### Creating a new dataframe 

In [None]:
# Work on an independent copy of the survey data. The previous full slice
# (`iloc[0:, 0:]`) selected every row and column but left `df` derived from
# `rankings`, so the column assignments made later in the notebook were
# chained-assignment candidates; an explicit copy makes the intent clear and
# guarantees `rankings` is never touched.
df = rankings.copy()

In [None]:
df.head()

Unnamed: 0,OS,Budget,Size,Camera Importance,Usage Hours (Battery),Speed Importance,top,second,third
0,Must be Android,400.0,Large,A little,A little,A lot,Motorola Moto G100,Motorola Edge (2021),Samsung Galaxy A42
1,Must be Android,400.0,Large,A ton,A little,I need speed,Google Pixel 6,OnePlus 9,Samsung Galaxy S20 FE
2,Must be Android,400.0,Medium,Not sure,Not sure,I need speed,OnePlus 8T,Motorola Moto G100,OnePlus 9
3,Must be Android,400.0,No preference,Somewhat,A little,A little,Google Pixel 5a,Samsung Galaxy A53,Samsung Galaxy A42
4,Must be Android,400.0,No preference,A ton,A lot,A lot,Motorola Moto G100,Samsung Galaxy A42,Samsung Galaxy A53


### Cleaning up white spaces and converting all characters to lower case
If one OS value is *Must be Android* and another is *Must be iPhone*, the vectorizer will count the words *Must be* in both cases, and the recommender system might consider the phones highly similar even though they are not related at all. So we will remove the whitespace from these columns (and lower-case them), so that each value becomes a single token.

In [None]:
def clean_text(OS):
    """Return the value as lower-case text with every space character removed.

    The argument is converted with ``str()`` first, so non-string values
    (numbers, ``None``) are accepted as well.
    """
    lowered = str(OS).lower()
    # Splitting on the space character and re-joining drops exactly the spaces.
    return "".join(lowered.split(" "))

# Normalise the OS answer and the three ranked phone names with clean_text so
# that each value becomes a single lower-case token for the vectorizer.
# One loop replaces the three further, byte-for-byte identical redefinitions of
# clean_text that used to precede each column assignment.
for column in ('OS', 'top', 'second', 'third'):
    df[column] = df[column].apply(clean_text)

In [None]:
df.head()

Unnamed: 0,OS,Budget,Size,Camera Importance,Usage Hours (Battery),Speed Importance,top,second,third
0,mustbeandroid,400.0,Large,A little,A little,A lot,motorolamotog100,motorolaedge(2021),samsunggalaxya42
1,mustbeandroid,400.0,Large,A ton,A little,I need speed,googlepixel6,oneplus9,samsunggalaxys20fe
2,mustbeandroid,400.0,Medium,Not sure,Not sure,I need speed,oneplus8t,motorolamotog100,oneplus9
3,mustbeandroid,400.0,No preference,Somewhat,A little,A little,googlepixel5a,samsunggalaxya53,samsunggalaxya42
4,mustbeandroid,400.0,No preference,A ton,A lot,A lot,motorolamotog100,samsunggalaxya42,samsunggalaxya53


### Let’s combine these columns to create the train and test variables

In [None]:
# "train" is the text fed to the vectorizer: the six answers followed by the
# three ranked phones, concatenated left to right.
# NOTE(review): no separator is inserted, so neighbouring values fuse into one
# string (the recorded output shows e.g. "...LargeA littleA littleA lotm...");
# confirm this tokenisation is intended.
train_columns = [
    'OS', 'Budget', 'Size', 'Camera Importance', 'Usage Hours (Battery)',
    'Speed Importance', 'top', 'second', 'third',
]
train_text = df[train_columns[0]]
for column_name in train_columns[1:]:
    train_text = train_text + df[column_name]
df["train"] = train_text

In [None]:
# "test" is the lookup key for a respondent profile: the six answers only,
# concatenated left to right with no separator. get_recommendations rebuilds
# this exact string from the user's answers, so the format must not change.
profile_columns = [
    'OS', 'Budget', 'Size', 'Camera Importance', 'Usage Hours (Battery)',
    'Speed Importance',
]
profile_key = df[profile_columns[0]]
for column_name in profile_columns[1:]:
    profile_key = profile_key + df[column_name]
df["test"] = profile_key

In [None]:
df.head()

Unnamed: 0,OS,Budget,Size,Camera Importance,Usage Hours (Battery),Speed Importance,top,second,third,train,test
0,mustbeandroid,400.0,Large,A little,A little,A lot,motorolamotog100,motorolaedge(2021),samsunggalaxya42,mustbeandroid400.00LargeA littleA littleA lotm...,mustbeandroid400.00LargeA littleA littleA lot
1,mustbeandroid,400.0,Large,A ton,A little,I need speed,googlepixel6,oneplus9,samsunggalaxys20fe,mustbeandroid400.00LargeA tonA littleI need sp...,mustbeandroid400.00LargeA tonA littleI need speed
2,mustbeandroid,400.0,Medium,Not sure,Not sure,I need speed,oneplus8t,motorolamotog100,oneplus9,mustbeandroid400.00MediumNot sureNot sureI nee...,mustbeandroid400.00MediumNot sureNot sureI nee...
3,mustbeandroid,400.0,No preference,Somewhat,A little,A little,googlepixel5a,samsunggalaxya53,samsunggalaxya42,mustbeandroid400.00No preferenceSomewhatA litt...,mustbeandroid400.00No preferenceSomewhatA litt...
4,mustbeandroid,400.0,No preference,A ton,A lot,A lot,motorolamotog100,samsunggalaxya42,samsunggalaxya53,mustbeandroid400.00No preferenceA tonA lotA lo...,mustbeandroid400.00No preferenceA tonA lotA lot


### Vectorize the Dataframe with TF-IDF

In [None]:
# Fit a TF-IDF model (unigrams and bigrams, English stop words removed) on the
# combined "train" text and turn every row into a sparse feature vector.
# min_df=1 (the library default) keeps every term, exactly as the former
# integer min_df=0 did, but is also accepted by scikit-learn releases that
# validate parameters and reject an integer below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df["train"])

In [None]:
tfidf_matrix.shape

(177, 640)

### Applying cosine similarity to the vectors

In [None]:
# Pairwise dot products between all row vectors. With the vectorizer's default
# L2 normalisation (no `norm` override is passed when it is built) the dot
# product equals the cosine similarity. Omitting the second argument makes
# linear_kernel compare the matrix with itself.
cosine_sim = linear_kernel(tfidf_matrix)

In [None]:
# Label both axes of the similarity matrix with the profile keys so it can be
# inspected as a table, then move the row labels into an ordinary column.
profile_labels = df['test']
similarities = pd.DataFrame(cosine_sim, index=profile_labels, columns=profile_labels)
similarities = similarities.reset_index()
similarities.head()

test,test.1,mustbeandroid400.00LargeA littleA littleA lot,mustbeandroid400.00LargeA tonA littleI need speed,mustbeandroid400.00MediumNot sureNot sureI need speed,mustbeandroid400.00No preferenceSomewhatA littleA little,mustbeandroid400.00No preferenceA tonA lotA lot,mustbeandroid400.00No preferenceNot sureA lotA lot,mustbeandroid400.00SmallA tonA tonNot sure,mustbeandroid400.00SmallSomewhatNot sureI need speed,mustbeandroid600.00LargeA littleA littleNot sure,...,preferiphoneNot sureSmallNot sureA littleI need speed,preferiphoneNot sureMediumA lotNot sureI need speed,preferiphoneNot sureNo preferenceA littleA tonA lot,preferiphoneNot sureNo preferenceA lotA littleA lot,preferiphoneNot sureNo preferenceA littleA lotI need speed,preferiphoneNot sureNo preferenceA littleNot sureA lot,preferiphoneNot sureNo preferenceNot sureNot sureA little,preferiphoneNot sureSmallA littleNot sureNot sure,preferiphoneNot sureSmallA lotA lotA little,preferiphoneNot sureSmallNot sureNot sureNot sure
0,mustbeandroid400.00LargeA littleA littleA lot,1.0,0.226972,0.065592,0.129148,0.070813,0.139533,0.069863,0.067549,0.226691,...,0.0,0.0,0.063986,0.062774,0.055823,0.0,0.0,0.0,0.0,0.0
1,mustbeandroid400.00LargeA tonA littleI need speed,0.226972,1.0,0.111733,0.083619,0.119917,0.07288,0.118308,0.115067,0.075489,...,0.152699,0.031741,0.03559,0.0,0.031693,0.0,0.0,0.0,0.0,0.0
2,mustbeandroid400.00MediumNot sureNot sureI nee...,0.065592,0.111733,1.0,0.086396,0.088536,0.075301,0.087348,0.513079,0.0,...,0.027926,0.146415,0.0,0.0,0.032746,0.0,0.036992,0.036528,0.0,0.08122
3,mustbeandroid400.00No preferenceSomewhatA litt...,0.129148,0.083619,0.086396,1.0,0.266401,0.226576,0.092021,0.088974,0.037667,...,0.0,0.0,0.04214,0.041342,0.036764,0.0,0.0,0.0,0.0,0.0
4,mustbeandroid400.00No preferenceA tonA lotA lot,0.070813,0.119917,0.088536,0.266401,1.0,0.265331,0.131966,0.091178,0.0,...,0.0,0.0,0.077096,0.076443,0.032626,0.039309,0.0,0.0,0.068957,0.0


## Displaying User Recommendations

In [None]:
# Renumber the rows so that row labels line up with the row positions of
# cosine_sim (the previous index is kept as a regular column).
df = df.reset_index()
# Profile key of every row.
titles = df['test']
# Reverse lookup: profile key -> row number in df / cosine_sim.
indices = pd.Series(df.index, index=titles)

In [None]:
def get_recommendations(user_input):
    """Return the row numbers of the three profiles most similar to the input.

    Parameters
    ----------
    user_input : sequence of str
        The answers in the order used to build the ``test`` key. Only the
        first entry (the OS answer) is normalised (spaces removed,
        lower-cased); the remaining entries are used verbatim.

    Returns
    -------
    list of int
        Up to three row positions of ``cosine_sim``, best match first. The
        matched profile itself is normally the first entry because its
        similarity to itself is the highest.

    Raises
    ------
    IndexError
        If ``user_input`` is empty.
    KeyError
        If the combination of answers does not occur in ``indices``.
    """
    # Build the lookup key without modifying the caller's list.
    normalised_os = user_input[0].replace(" ", "").lower()
    title = "".join([normalised_os, *user_input[1:]])

    idx = indices[title]
    # Several respondents can share the same answers; the lookup then returns
    # a Series of row numbers. Use the first matching row in that case.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # sorted() is stable, so rows with equal scores keep their dataset order.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    return [row_number for row_number, _ in ranked[:3]]


### Give user input

In [None]:
# Answers in the order OS, Budget, Size, Camera Importance,
# Usage Hours (Battery), Speed Importance. get_recommendations only normalises
# the first entry; the others must match the dataset spelling exactly.
user_input = ['Must be android', '400.00', 'Large', 'A ton', 'A little', 'I need speed']

### predict_top_3_ranking function

In [None]:
def predict_top_3_ranking(user_input):
    """Print the phone rankings of the three profiles closest to the input.

    For each of the row numbers returned by ``get_recommendations`` (best
    match first) one line is printed with that row's ``top``, ``second`` and
    ``third`` phone.

    Previously the second and third lines mixed rows ``[1]`` and ``[2]`` and
    therefore printed the same, incorrect combination twice; every line now
    reads all three phones from its own row.
    """
    ranked_rows = get_recommendations(user_input)
    labels = (
        "best recommendation: \t  \t",
        "second best recommendation:  \t",
        "Third best recommendation:  \t",
    )
    for label, row_number in zip(labels, ranked_rows):
        row = df.loc[row_number]
        print(label, row['top'], "\t", row['second'], "\t", row['third'])

predict_top_3_ranking(user_input)

best recommendation: 	  	 googlepixel6 	 oneplus9 	 samsunggalaxys20fe
second best recommendation:  	 googlepixel6 	 motorolaedge(2021) 	 appleiphone12mini
Third best recommendation:  	 googlepixel6 	 motorolaedge(2021) 	 appleiphone12mini


## Drawbacks: 
The main drawback of content-based filtering is that similar items are grouped together, and users cannot be recommended products whose content is not in the training set: a combination of answers that does not appear in the dataset raises a `KeyError`.
