# Course Recommendation using K-Means

### Clustering defined

Clustering is the task of grouping a set of objects so that objects in the same cluster are more similar to each other than to objects in other clusters. Similarity is a measure of the strength of the relationship between two data objects.

In [1]:
# importing libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay
from sklearn.cluster import KMeans, AffinityPropagation
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Read the raw courses CSV from disk (absolute, machine-specific path)
# and display the resulting frame.
source_csv = r"D:\pythonProject\final_courses.csv"
coursesData = pd.read_csv(source_csv)
coursesData

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
coursesData.describe(include='all')

In [None]:
# Print column names, dtypes and non-null counts.
coursesData.info()

In [None]:
# Keep only the columns used by the rest of the notebook.
# .copy() makes the result an independent frame, so the later in-place
# assignment to the 'skills' column does not write into a slice of the
# original frame (that pattern raises SettingWithCopyWarning, which the
# warnings filter at the top of the notebook would silently hide).
coursesData = coursesData[['title', 'url', 'is_paid', 'course_by', 'skills']].copy()
coursesData

In [None]:
# Count rows that are exact duplicates of an earlier row.
coursesData.duplicated().sum()

In [None]:
# Count missing values per column.
coursesData.isnull().sum()

In [None]:
# Turn the comma-separated 'skills' string into a list of skill tokens.
# The split is on ',' only, so each token keeps any whitespace that
# surrounded it in the original string.
coursesData['skills'] = coursesData['skills'].str.split(',')
coursesData

In [None]:
# One row per (course, skill) pair; the index is renumbered from 0.
coursesData = coursesData.explode('skills', ignore_index=True)
# The earlier split on ',' keeps the spaces around each token
# ("a, b" -> "a", " b"). Trim them so a skill always has one spelling:
# otherwise padded variants get their own label codes and are missed by
# exact-match lookups such as skills == 'Data Science'.
coursesData['skills'] = coursesData['skills'].str.strip()
coursesData

In [None]:
# Number of rows per distinct skill, most frequent first.
coursesData['skills'].value_counts()

In [None]:
# Work on a copy so the label encoding below leaves coursesData untouched.
coursesDataEncoded = coursesData.copy()

### Label Encoding for Non-numeric Columns

In [None]:
# Columns to replace with integer codes (one code per distinct value).
features = ['title','url','is_paid','course_by','skills']

# Keep one fitted encoder per column. Re-fitting a single shared encoder
# overwrites its learned classes on every iteration, so only the last
# column's codes could ever be mapped back to the original values.
label_encoders = {}

for col in features:
    label_encoder = LabelEncoder()
    coursesDataEncoded[col] = label_encoder.fit_transform(coursesDataEncoded[col])
    label_encoders[col] = label_encoder
    # To decode: label_encoders[col].inverse_transform(coursesDataEncoded[col])

# Display the encoded frame; coursesData is not modified by this cell.
coursesDataEncoded

In [None]:
# Elbow method: fit KMeans for k = 2..14 and record the inertia of each fit
# so the bend in the curve can be read off the plot.
clusters_range = list(range(2, 15))
inertias = []

for c in clusters_range:
    kmeans = KMeans(n_clusters=c, init='k-means++', random_state=0)
    kmeans.fit(coursesDataEncoded)
    inertias.append(kmeans.inertia_)

# Inertia against number of clusters.
plt.figure()
plt.plot(clusters_range, inertias, marker='o')
plt.title("Elbow Plot")
plt.xlabel("Clusters")
plt.ylabel("Inertias")

In [None]:
# Final model: six clusters, k-means++ seeding, fixed seed for repeatable runs.
kmeans = KMeans(n_clusters=6, init='k-means++', random_state=0)
kmeans.fit(coursesDataEncoded)
kmeans

In [None]:
# Cluster label of every row, taken from the already fitted model.
# KMeans.fit stores the assignments in labels_; calling fit_predict here
# would re-run the whole clustering on the same data just to read them.
identifiedCluster = list(kmeans.labels_)
identifiedCluster


In [None]:
import joblib

# Persist the fitted KMeans model to disk with joblib.
joblib.dump(kmeans, r'D:\pythonProject\kmeansClustering.pkl')

In [None]:
# Attach each row's cluster label to the encoded frame (the list is assigned
# by position, one label per row).
coursesDataEncoded["cluster_predicted"] = identifiedCluster
coursesDataEncoded

In [None]:
# Attach the same labels to the human-readable frame. coursesDataEncoded was
# created as a copy of coursesData, so the rows are in the same order.
coursesData["cluster_predicted"] = identifiedCluster
coursesData

In [None]:
# Summary statistics with describe()'s default column selection.
coursesData.describe()

### Cluster Identification 

In [None]:
# Split the labelled frame into one sub-frame per cluster id (0-5).
predicted_labels = coursesData['cluster_predicted']
cluster_0 = coursesData[predicted_labels == 0]
cluster_1 = coursesData[predicted_labels == 1]
cluster_2 = coursesData[predicted_labels == 2]
cluster_3 = coursesData[predicted_labels == 3]
cluster_4 = coursesData[predicted_labels == 4]
cluster_5 = coursesData[predicted_labels == 5]

In [None]:
# Row count and per-column non-null counts for cluster 0.
cluster_0.info()

In [None]:
# Display the rows assigned to cluster 1.
cluster_1

In [None]:
# Row count and per-column non-null counts for cluster 1.
cluster_1.info()

In [None]:
# Row count and per-column non-null counts for cluster 2.
cluster_2.info()

In [None]:
# Row count and per-column non-null counts for cluster 3.
cluster_3.info()

In [None]:
# Load the persisted KMeans model back from disk.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
kmeans_model = joblib.load(r'D:\pythonProject\kmeansClustering.pkl')
kmeans_model


In [None]:
# Write both labelled frames (readable and encoded) to CSV, with a header
# row and without the index column.
export_options = dict(header=True, index=False)
coursesData.to_csv(r"D:\pythonProject\coursesData.csv", **export_options)
coursesDataEncoded.to_csv(r"D:\pythonProject\coursesDataEnc.csv", **export_options)

In [None]:
# Read the exported, cluster-labelled courses back from disk and display them.
labelled_csv = r'D:\pythonProject\coursesData.csv'
courses_data = pd.read_csv(labelled_csv)
courses_data

In [None]:
# Boolean mask: True where the skill is exactly 'Data Science'.
courses_data['skills'] == 'Data Science'


In [None]:
# Rows whose skill is exactly 'Data Science'.
courses_data[courses_data['skills'] == 'Data Science']

In [None]:
# Cluster labels of the 'Data Science' rows.
courses_data[courses_data['skills'] == 'Data Science'].cluster_predicted

In [None]:
# Check the type of the selected column.
type(courses_data[courses_data['skills'] == 'Data Science'].cluster_predicted)

In [None]:
# Cluster labels of every 'Data Science' row, as a plain Python list.
data_science_rows = courses_data[courses_data['skills'] == 'Data Science']
cluster_no_list = data_science_rows['cluster_predicted'].tolist()
cluster_no_list

In [None]:
# Same lookup kept as a Series, so the original row index is preserved.
is_data_science = courses_data['skills'] == 'Data Science'
cluster_no = courses_data.loc[is_data_science, 'cluster_predicted']
cluster_no

In [None]:
# Boolean mask: rows in the same cluster as the first 'Data Science' match.
# Indexing [0] raises IndexError when cluster_no_list is empty.
courses_data['cluster_predicted']==cluster_no_list[0]


In [None]:
# All rows that share the cluster of the first 'Data Science' match.
# Only that first cluster is used, even if the matches span several clusters;
# indexing [0] raises IndexError when cluster_no_list is empty.
courses_data[courses_data['cluster_predicted']==cluster_no_list[0]]