# Content Based Filtering : Kampus Merdeka

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import re
import random

df = pd.read_csv("dataset_tugas.csv")
df.head()

Unnamed: 0,company,address,description
0,Alfaruq Marsalsani Supriyatno,"Ngaliyan, Semarang, Jawa Tengah, 50187","Programming Algorithm, Programming Language, D..."
1,PT Neosia Pratama Indonusa,"Sona Topas Tower, lt. 5A/526. Jl. Jendral Sudi...",Disciplined BIM Modeling Structure BIM Modelin...
2,PT Neosia Pratama Indonusa,"Sona Topas Tower, lt. 5A/526. Jl. Jendral Sudi...",Disciplined BIM Modeling Structure BIM Modelin...
3,PT Mitra Semeru Indonesia,"Jl. Mampang Prapatan Raya no 84A, Tegal Parang...","03.Branding on Logo, Tagline, Brand Value & Br..."
4,Perkumpulan Sosial Ekonomi Akselerator Lab,"Kawasan Ekonomi Khusus Singhasari, Raya Klampo...",AWS Cloud Computing Concept of Digital Transfo...


## **1. Ikhtisar**

In [2]:
df.describe()

Unnamed: 0,company,address,description
count,20,20,20
unique,15,15,19
top,PT. IlmuKomputerCom Braindevs Sistema,"Menara Bidakara 1 Lantai 2, Jl. Gatot Subroto...",Disciplined BIM Modeling Structure BIM Modelin...
freq,3,3,2


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   company      20 non-null     object
 1   address      20 non-null     object
 2   description  20 non-null     object
dtypes: object(3)
memory usage: 608.0+ bytes


## **2. Deskripsi Kompetensi (Sebelum Preprocessing)**

In [6]:
def print_description(index):
    example = df[df.index == index][['description', 'company', 'address']].values[0]
    if len(example) > 0:
        print(example[0])
        print('Nama Perusahaan\t:', example[1])
        print('Alamat\t \t:', example[2])

In [7]:
print_description(1)

Disciplined BIM Modeling Structure BIM Modeling Disciplined Architecture Workflow Building and Infrastructure Modeling Presentation and Visualization Project Workflow analysis and team collaboration on BIM Modeling Projects Structure Analysis BIM Modeling Mechanical Disciplines BIM Modeling Electrical Disciplines BIM Modeling Generative Components Project Digital Collaboration AEC
Nama Perusahaan	: PT Neosia Pratama Indonusa
Alamat	 	: Sona Topas Tower, lt. 5A/526. Jl. Jendral Sudirman, KAV 26. Karet Kuningan, Setiabudi. Jakarta Selatan. DKI Jakarta


In [10]:
print_description(7)

Systems Analysis and Design Communication Skills Team Work Time Management Software Engineering
Nama Perusahaan	: PT. IlmuKomputerCom Braindevs Sistema
Alamat	 	: Menara Bidakara 1  Lantai 2, Jl. Gatot Subroto Kav. 71-73 Pancoran Jakarta Selatan 12870


In [11]:
print_description(18)

Design Thinking Visual Branding Video tutorial dan hands on production Motion Comic production Desain Media Interaktif
Nama Perusahaan	: PT Sebangku Jaya Abadi
Alamat	 	: Yogyakarta


## **3. Text Preprocessing**

In [13]:
clean_spcl = re.compile('[/(){}\[\]\|@,;]')
clean_symbol = re.compile('[^0-9a-z #+_]')
stopworda = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = clean_spcl.sub(' ', text)
    text = clean_symbol.sub('', text)
    text = ' '.join(word for word in text.split() if word not in stopworda) # hapus stopword dari kolom deskripsi
    return text
    
df['desc_clean'] = df['description'].apply(clean_text)

In [14]:
df.head()

Unnamed: 0,company,address,description,desc_clean
0,Alfaruq Marsalsani Supriyatno,"Ngaliyan, Semarang, Jawa Tengah, 50187","Programming Algorithm, Programming Language, D...",programming algorithm programming language dig...
1,PT Neosia Pratama Indonusa,"Sona Topas Tower, lt. 5A/526. Jl. Jendral Sudi...",Disciplined BIM Modeling Structure BIM Modelin...,disciplined bim modeling structure bim modelin...
2,PT Neosia Pratama Indonusa,"Sona Topas Tower, lt. 5A/526. Jl. Jendral Sudi...",Disciplined BIM Modeling Structure BIM Modelin...,disciplined bim modeling structure bim modelin...
3,PT Mitra Semeru Indonesia,"Jl. Mampang Prapatan Raya no 84A, Tegal Parang...","03.Branding on Logo, Tagline, Brand Value & Br...",03branding logo tagline brand value brand stor...
4,Perkumpulan Sosial Ekonomi Akselerator Lab,"Kawasan Ekonomi Khusus Singhasari, Raya Klampo...",AWS Cloud Computing Concept of Digital Transfo...,aws cloud computing concept digital transforma...


## **4. Deskripsi Kompetensi (Setelah Preprocessing)**

In [17]:
# Deskripsi kedua (Setelah preprocessing)
def print_description_clean(index):
    example = df[df.index == index][['desc_clean', 'company', 'address']].values[0]
    if len(example) > 0:
        print(example[0])
        print('Nama:', example[1])
        print('Alamat:', example[2])

In [18]:
print_description_clean(1)

disciplined bim modeling structure bim modeling disciplined architecture workflow building infrastructure modeling presentation visualization project workflow analysis team collaboration bim modeling projects structure analysis bim modeling mechanical disciplines bim modeling electrical disciplines bim modeling generative components project digital collaboration aec
Nama: PT Neosia Pratama Indonusa
Alamat: Sona Topas Tower, lt. 5A/526. Jl. Jendral Sudirman, KAV 26. Karet Kuningan, Setiabudi. Jakarta Selatan. DKI Jakarta


In [19]:
print_description_clean(7)

systems analysis design communication skills team work time management software engineering
Nama: PT. IlmuKomputerCom Braindevs Sistema
Alamat: Menara Bidakara 1  Lantai 2, Jl. Gatot Subroto Kav. 71-73 Pancoran Jakarta Selatan 12870


In [20]:
print_description(18)

Design Thinking Visual Branding Video tutorial dan hands on production Motion Comic production Desain Media Interaktif
Nama Perusahaan	: PT Sebangku Jaya Abadi
Alamat	 	: Yogyakarta


## **5. TF-IDF & Cosine Similarity**

In [21]:
df.set_index('company', inplace=True)
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(df['desc_clean'])
cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cos_sim

array([[1.        , 0.00985299, 0.00985299, 0.00152253, 0.05583686,
        0.01881026, 0.02359147, 0.03333778, 0.03333778, 0.10319529,
        0.        , 0.        , 0.08101491, 0.02142874, 0.01915351,
        0.09567853, 0.        , 0.0071368 , 0.00331359, 0.01355846],
       [0.00985299, 1.        , 1.        , 0.        , 0.00751428,
        0.        , 0.00354891, 0.01832409, 0.01832409, 0.00986682,
        0.00805488, 0.        , 0.01244081, 0.        , 0.02041834,
        0.00921177, 0.        , 0.02476587, 0.        , 0.        ],
       [0.00985299, 1.        , 1.        , 0.        , 0.00751428,
        0.        , 0.00354891, 0.01832409, 0.01832409, 0.00986682,
        0.00805488, 0.        , 0.01244081, 0.        , 0.02041834,
        0.00921177, 0.        , 0.02476587, 0.        , 0.        ],
       [0.00152253, 0.        , 0.        , 1.        , 0.0102339 ,
        0.01625925, 0.01226297, 0.00424957, 0.00424957, 0.00761102,
        0.        , 0.05404043, 0.00598801, 0

In [22]:
# Set index utama di kolom 'company'
indices = pd.Series(df.index)
indices[:50]

0                  Alfaruq Marsalsani Supriyatno
1                     PT Neosia Pratama Indonusa
2                     PT Neosia Pratama Indonusa
3                      PT Mitra Semeru Indonesia
4     Perkumpulan Sosial Ekonomi Akselerator Lab
5                      PT Mitra Semeru Indonesia
6                  PT STECHOQ ROBOTIKA INDONESIA
7          PT. IlmuKomputerCom Braindevs Sistema
8          PT. IlmuKomputerCom Braindevs Sistema
9          PT. IlmuKomputerCom Braindevs Sistema
10                 PT Hashmicro Solusi Indonesia
11                 Yayasan Adipurna Inovasi Asia
12               PT Nodeflux Teknologi Indonesia
13                    PT Widya Inovasi Indonesia
14                          PT Pundi Mas Berjaya
15                  PT Disty Teknologi Indonesia
16                       PT Uni Tokopo Teknologi
17                           UPT Solo Technopark
18                        PT Sebangku Jaya Abadi
19                    PT Widya Inovasi Indonesia
Name: company, dtype

## **6. Modelling**

In [55]:
def recommendations(name, cos_sim = cos_sim):
    
    recommended_comp = []
    
    # Mengambil nama kompetensi berdasarkan variabel indicies
    idx = indices[indices == name].index[0]

    # Membuat series berdasarkan skor kesamaan
    score_series = pd.Series(cos_sim[idx]).sort_values(ascending = False)

    # mengambil index dan dibuat 10 baris rekomendasi terbaik
    top_10_indexes = list(score_series.iloc[1:11].index)

    print("\nREKOMENDASI PERUSAHAAN PROGRAM KAMPUS MERDEKA\n")
    print("="*60)
    
    for i in top_10_indexes:
        recommended_comp.append(list(df.index)[i])
        print()
        print('NAMA PERUSAHAAN\t:', list(df.index)[i])
        print('ALAMAT\t\t:', list(df['address'])[i])
        print()
        print("="*60)

## **7. Prediksi**

In [56]:
recommendations('Alfaruq Marsalsani Supriyatno')


REKOMENDASI PERUSAHAAN PROGRAM KAMPUS MERDEKA


NAMA PERUSAHAAN	: PT. IlmuKomputerCom Braindevs Sistema
ALAMAT		: Menara Bidakara 1  Lantai 2, Jl. Gatot Subroto Kav. 71-73 Pancoran Jakarta Selatan 12870


NAMA PERUSAHAAN	: PT Disty Teknologi Indonesia
ALAMAT		: Gedung Graha Pena Lantai 5 Jl. A.Yani No. 88 Surabaya


NAMA PERUSAHAAN	: PT Nodeflux Teknologi Indonesia
ALAMAT		: Jakarta Indonesia


NAMA PERUSAHAAN	: Perkumpulan Sosial Ekonomi Akselerator Lab
ALAMAT		: Kawasan Ekonomi Khusus Singhasari, Raya Klampok Blok D2 no 16 Purwoasri, Singosari, Kab. Malang


NAMA PERUSAHAAN	: PT. IlmuKomputerCom Braindevs Sistema
ALAMAT		: Menara Bidakara 1  Lantai 2, Jl. Gatot Subroto Kav. 71-73 Pancoran Jakarta Selatan 12870


NAMA PERUSAHAAN	: PT. IlmuKomputerCom Braindevs Sistema
ALAMAT		: Menara Bidakara 1  Lantai 2, Jl. Gatot Subroto Kav. 71-73 Pancoran Jakarta Selatan 12870


NAMA PERUSAHAAN	: PT STECHOQ ROBOTIKA INDONESIA
ALAMAT		: Jalan Belimbing A-17 Perum Sidoarum Blok II, Godean, Sleman,

# Selesai...