In [61]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [62]:
df = pd.read_csv("courses.csv")
df.head()

Unnamed: 0,Class Number,Class Name,Unique Number,Division,Days,Time,Location,Instructor,Flags,Core,Description
0,AAS,312 INTRO TO ASIAN AMERICAN HIST,32750,Lower,TTH,12:30 p.m.-2:00 p.m.,,,Cultural Diversity,U.S. History,
1,AAS,320D ASIAN AMERICAN MEDIA CULS,32755,Upper,TTH,2:00 p.m.-3:30 p.m.,CMA 3.120,"MALLAPRAGADA, MADHAVI",Cultural Diversity,,
2,AAS,325G HIST SE ASIAN DIASP IN US,32759,Upper,MWF,1:00 p.m.-2:00 p.m.,PAR 206,"GIBBS, KEVIN MASAKAZU",Cultural Diversity,U.S. History,
3,AAS,325J SOUTH ASIAN MIGRATION TO US,32760,Upper,TTH,12:30 p.m.-2:00 p.m.,WAG 420,"MEHTA, MOHIT P",Cultural Diversity,U.S. History,
4,AAS,325N ASIAN AMERICAN JURISPRUDENCE,32765,Upper,TTH,3:30 p.m.-5:00 p.m.,BIO 301,"JIN, ARNOLD R",Ethics Cultural Diversity,,


In [63]:
print("Shape of dataset:", df.shape)
print("Size of dataset:", df.size)

Shape of dataset: (5413, 11)
Size of dataset: 59543


In [64]:
df.isnull().sum()

Class Number        0
Class Name          0
Unique Number       0
Division            0
Days              450
Time              450
Location          837
Instructor        348
Flags            3458
Core             4676
Description      5413
dtype: int64

In [65]:
df.duplicated().any()

False

In [66]:
import neattext.functions as nfx

In [67]:
df["Class Name"]

0        312 INTRO TO ASIAN AMERICAN HIST
1          320D ASIAN AMERICAN MEDIA CULS
2          325G HIST SE ASIAN DIASP IN US
3        325J SOUTH ASIAN MIGRATION TO US
4       325N ASIAN AMERICAN JURISPRUDENCE
                      ...                
5408         679HB HONORS TUTORIAL COURSE
5409               379L INTERNSHIP IN WGS
5410                  379S SENIOR SEMINAR
5411    612 ACCELERATED SECOND-YR YIDDISH
5412             611C INTERMEDIATE YORUBA
Name: Class Name, Length: 5413, dtype: object

In [68]:
from sklearn.feature_extraction.text import CountVectorizer

In [69]:
cv = CountVectorizer(max_features=3000)
vectors = cv.fit_transform(df["Class Name"]).toarray()

In [70]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [71]:
df.shape

(5413, 11)

In [72]:
len(cv.get_feature_names_out())

3000

In [73]:
from sklearn.metrics.pairwise import cosine_similarity 

In [74]:
similarity = cosine_similarity(vectors)

In [75]:
similarity

array([[1.        , 0.36514837, 0.33333333, ..., 0.        , 0.        ,
        0.        ],
       [0.36514837, 1.        , 0.18257419, ..., 0.        , 0.        ,
        0.        ],
       [0.33333333, 0.18257419, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [76]:
sorted(enumerate(similarity[0]), reverse=True, key=lambda x:x[1])

[(0, 1.0000000000000002),
 (2744, 0.8333333333333336),
 (243, 0.5477225575051662),
 (244, 0.5477225575051662),
 (2733, 0.5477225575051662),
 (2734, 0.5477225575051662),
 (3965, 0.5477225575051662),
 (177, 0.5000000000000001),
 (2743, 0.5000000000000001),
 (2497, 0.4714045207910318),
 (4, 0.4082482904638631),
 (262, 0.4082482904638631),
 (840, 0.4082482904638631),
 (1174, 0.4082482904638631),
 (1175, 0.4082482904638631),
 (1176, 0.4082482904638631),
 (1177, 0.4082482904638631),
 (1944, 0.4082482904638631),
 (2102, 0.4082482904638631),
 (2498, 0.4082482904638631),
 (2968, 0.4082482904638631),
 (3096, 0.4082482904638631),
 (4290, 0.4082482904638631),
 (4499, 0.4082482904638631),
 (1, 0.36514837167011077),
 (89, 0.36514837167011077),
 (90, 0.36514837167011077),
 (91, 0.36514837167011077),
 (92, 0.36514837167011077),
 (93, 0.36514837167011077),
 (94, 0.36514837167011077),
 (95, 0.36514837167011077),
 (175, 0.36514837167011077),
 (289, 0.36514837167011077),
 (701, 0.36514837167011077),
 (729

In [77]:
def content_recomm(course):
    course_index = df[df['Class Name']==course].index[0]
    sim = similarity[course_index]
    course_list = sorted(enumerate(similarity[0]), reverse=True, key=lambda x:x[1])[1:6]
    for i in course_list:
        print(df.iloc[i[0]]['Class Name'])

In [78]:
list = content_recomm('312 INTRO TO ASIAN AMERICAN HIST')


317L 4-INTRO TO ASIAN AMERICAN HIST
310 INTRO TO AMERICAN STUDIES
310 INTRO TO AMERICAN STUDIES
315G INTRO TO AMERICAN STUDIES
315G INTRO TO AMERICAN STUDIES
