In [1]:
# Steps
# 1. Data collection
# 2. Data processing
# 3. Model training

In [2]:
pip install pandas==1.3.5

In [3]:
# import libraries
import pandas as pd
import numpy as np

In [4]:
# Load the laptop catalogue; the path is relative to the notebook's working directory.
laptops = pd.read_csv("Dataset/laptops.csv")

In [5]:
# Preview the first three rows of the raw data.
laptops.head(3)

In [6]:
# (rows, columns) of the raw data.
laptops.shape

In [7]:
# Column dtypes and non-null counts.
laptops.info()

In [8]:
# Keep only the columns used in the rest of the notebook. The explicit `.copy()`
# makes the result an independent frame, so the in-place edits and column
# assignments in later cells cannot be treated as writes to a slice of the raw table.
laptops = laptops[['laptop_id','img_link','name','price','processor','ram','os','storage','rating','laptop_brand','os_brand','processor_brand','usecases']].copy()

In [9]:
# Preview the frame after narrowing the columns.
laptops.head(3)

In [10]:
# MISSING VALUES

In [11]:
# Count missing values per column.
laptops.isnull().sum()

In [12]:
# Drop rows with any missing value, then renumber the index 0..n-1.
# The similarity matrix built later is addressed by row position, while
# `recommend` finds its query row through the index labels; resetting the index
# keeps labels and positions identical after rows have been removed.
laptops = laptops.dropna().reset_index(drop=True)

In [13]:
# Number of fully duplicated rows (reported only; nothing is removed here).
laptops.duplicated().sum()

In [14]:
# Spot-check a single value: the brand of the first row.
laptops.iloc[0].laptop_brand

In [15]:
"""
Work and Productivity
Education and Learning
Communication and Social Networking
Entertainment and Multimedia
Creative Work
"""

In [16]:
# Inspect one row before the text columns are tokenised.
laptops.head(1)

In [17]:
# Split each text column on whitespace into a list of tokens. `usecases` itself
# is left untouched; its tokens are stored in a separate `use` column.
for dest_col, src_col in (('processor', 'processor'), ('ram', 'ram'), ('os', 'os'), ('use', 'usecases')):
    laptops[dest_col] = laptops[src_col].apply(lambda text: text.split())

In [18]:
# Concatenate the four token lists into a single list of tags per laptop.
laptops['tags'] = laptops['processor']+laptops['ram']+laptops['os']+laptops['use']

In [19]:
# Check that the new `use` and `tags` columns are present.
laptops.head(1)

In [20]:
# Combined token list of the first laptop.
laptops.iloc[0].tags

In [21]:
# Build the modelling frame. After this drop, `new` keeps laptop_id, img_link,
# name, price, laptop_brand, usecases and tags.
new = laptops.drop(columns=['processor','ram','os','storage','rating','os_brand','processor_brand','use'])

In [22]:
# Preview the trimmed frame.
new.head(2)

In [23]:
# Collapse each token list into one space-separated string.
new['tags'] = new['tags'].map(" ".join)

In [24]:
# Tags of the first laptop, now a single space-separated string.
new.iloc[0].tags

In [25]:
# Normalise case so that otherwise identical tokens compare equal.
new['tags'] = new['tags'].str.lower()

In [26]:
# Preview after lower-casing.
new.head(2)

In [27]:
# Lower-cased tag string of the first laptop.
new.iloc[0].tags

In [28]:
import nltk

In [29]:
from nltk.stem.porter import PorterStemmer

In [30]:
# Single shared stemmer instance, used by `stem`.
ps = PorterStemmer()

In [31]:
def stem(text):
    """Stem every whitespace-separated token of ``text``.

    Each token is passed through the module-level stemmer ``ps`` and the
    results are joined back together with single spaces.
    """
    return " ".join(ps.stem(token) for token in text.split())

In [32]:
# Stem every token of every tag string. Re-running this cell stems the already
# stemmed text again.
new['tags'] = new['tags'].apply(stem)

In [33]:
# Preview the stemmed tags.
new.head(2)

In [34]:
# Count vectorizing (bag-of-words)

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

In [36]:
# Bag-of-words vectoriser: vocabulary capped at 5000 terms, English stop words removed.
cv = CountVectorizer(max_features = 5000, stop_words='english')

In [37]:
# Learn the vocabulary from the tags and build a dense count matrix
# (one row per laptop, one column per vocabulary term).
vector = cv.fit_transform(new['tags']).toarray()

In [38]:
# Display the count matrix.
vector

In [39]:
from sklearn.metrics.pairwise import cosine_similarity

In [40]:
# Pairwise cosine similarity between all rows of `vector`;
# similarity[i][j] compares the laptop at row position i with the one at position j.
similarity = cosine_similarity(vector)

In [41]:
# Display the similarity matrix.
similarity

In [42]:
# Similarity scores of the first laptop against every laptop.
similarity[0]

In [43]:
# Number of scores in one row of the similarity matrix.
len(similarity[0])

In [44]:
#Recommend

In [45]:
def recommend(laptop, top_n=5):
    """Print the ``usecases`` of the laptops most similar to the first match.

    NOTE: a later cell defines ``recommend`` again; when the notebook is run
    top to bottom, that later definition is the one in effect.

    Parameters
    ----------
    laptop : str
        Value compared for exact equality against the ``usecases`` column of
        ``new``; the first matching row is used as the query.
    top_n : int, default 5
        Number of neighbours to print.

    Raises
    ------
    IndexError
        If no row of ``new`` has ``usecases == laptop``.
    """
    # `similarity` is addressed by row position, so locate the query row by
    # position; index labels need not equal positions (e.g. after rows were dropped).
    positions = np.flatnonzero((new['usecases'] == laptop).to_numpy())
    if positions.size == 0:
        raise IndexError(f"no laptop with usecases == {laptop!r}")
    query = int(positions[0])

    ranked = sorted(enumerate(similarity[query]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query row explicitly rather than assuming it sorts first.
    neighbours = [pos for pos, _ in ranked if pos != query][:top_n]
    for pos in neighbours:
        print(new.iloc[pos]['usecases'])

In [46]:
def recommend(use, max_candidates=49):
    """Print the names of the laptops most similar to the first laptop listed for ``use``.

    Parameters
    ----------
    use : str
        Value compared for exact equality against the ``usecases`` column of
        ``new``; the first matching row is used as the query.
    max_candidates : int, default 49
        Number of nearest neighbours to scan. Names are de-duplicated, so fewer
        lines than this may be printed.

    Raises
    ------
    IndexError
        If no row of ``new`` has ``usecases == use``.
    """
    # `similarity` is addressed by row position, so locate the query row by
    # position; index labels need not equal positions (e.g. after rows were dropped).
    positions = np.flatnonzero((new['usecases'] == use).to_numpy())
    if positions.size == 0:
        raise IndexError(f"no laptop with usecases == {use!r}")
    query = int(positions[0])

    ranked = sorted(enumerate(similarity[query]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query row explicitly rather than assuming it sorts first.
    neighbours = [pos for pos, _ in ranked if pos != query][:max_candidates]

    printed_names = set()
    for pos in neighbours:
        name = new.iloc[pos]['name']
        if name not in printed_names:
            printed_names.add(name)
            print(name)

In [47]:
# Ten highest-scoring (row position, similarity) pairs for the first laptop;
# sliced so the cell does not print the whole ranking.
sorted(enumerate(similarity[0]), key=lambda pair: pair[1], reverse=True)[:10]

In [48]:
# Example query: the string has to match a `usecases` value exactly.
recommend("Work and Productivity, Entertainment and Multimedia, Creative Work")

In [49]:
import pickle

In [50]:
# Persist the processed laptop table. The context manager closes the file
# handle even if pickling fails. The `model/` directory must already exist.
with open('model/usecases_list.pkl', 'wb') as fh:
    pickle.dump(new, fh)

In [51]:
# Persist the similarity matrix. The context manager closes the file handle
# even if pickling fails. The `model/` directory must already exist.
with open('model/similarity.pkl', 'wb') as fh:
    pickle.dump(similarity, fh)