# Bangkit Capstone Project : Funders

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/drive/1gPsSLT7i-QIZse9PRx0O7v13qewcyeSD?usp=sharing"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/bagussatya/ml-funders/blob/main/Capstone_Notebook.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

# Dependencies & Libraries

In [None]:
#Libraries

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#Visualization
import pandas as pd

## Data Preparation

In [None]:
def get_data(link):
  df = pd.read_json(link)
  df = pd.DataFrame(df['data'].values.tolist())
  return df

In [None]:
#Importing Enterpreneurs data from database
pengusaha_df = get_data('http://35.197.140.41/api/auth/get_all_pengusaha')
pengusaha_df.head(3)

Unnamed: 0,id,name,email,alamat,no_telp,status,provinsi,kabkota,kategori,lokasi,range_fund,range_employee
0,1,Usaha Daur Ulang kertas Semen,110pengusaha@capstone.com,,89787890879,Aktif,SUMATERA UTARA,KAB. DELI SERDANG,Industri Pengolahan,SUMATERA UTARA,> 8 Juta,< 10 Orang
1,2,Usaha Nyiru Dg Kanang,111pengusaha@capstone.com,,89787890880,Aktif,SULAWESI SELATAN,KAB. GOWA,Industri Pengolahan,SULAWESI SELATAN,> 8 Juta,< 10 Orang
2,3,Ira Furniture,112pengusaha@capstone.com,,89787890881,Aktif,DKI JAKARTA,KOTA ADM. JAKARTA TIMUR,Industri Pengolahan,DKI JAKARTA,> 8 Juta,< 10 Orang


In [62]:
#Importing the Investor profile data from database
investor_df = get_data('http://35.197.140.41/api/auth/get_all_investor')
investor_df = investor_df.iloc[-1:]
investor_df

Unnamed: 0,id,name,email,alamat,no_telp,status,provinsi,kabkota,kategori,lokasi,range_fund,range_employee
2,557,Yestika Dian Wulandari,wulan@capstone.com,,89787891435,Aktif,KALIMANTAN TIMUR,KOTA SAMARINDA,"Kesenian, Hiburan Dan Rekreasi",KALIMANTAN TIMUR,> 8 Juta,10 - 30 Orang


In [63]:
# Function to clean whitespaces in columns values
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [64]:
# Function to soup all the features
def create_soup(x):
    return x['kategori'] + ' ' + str(x['range_fund']) + ' ' + str(x['range_employee']) + ' ' + x['lokasi']

In [65]:
# Set the features
features = ['kategori', 'range_fund', 'range_employee', 'lokasi']

In [66]:
# Combining the enterpreneurs and choosen investor data to count similarity
df_train = pengusaha_df.append(investor_df,ignore_index=True)
df_train

Unnamed: 0,id,name,email,alamat,no_telp,status,provinsi,kabkota,kategori,lokasi,range_fund,range_employee
0,1,Usaha Daur Ulang kertas Semen,110pengusaha@capstone.com,,89787890879,Aktif,SUMATERA UTARA,KAB. DELI SERDANG,Industri Pengolahan,SUMATERA UTARA,> 8 Juta,< 10 Orang
1,2,Usaha Nyiru Dg Kanang,111pengusaha@capstone.com,,89787890880,Aktif,SULAWESI SELATAN,KAB. GOWA,Industri Pengolahan,SULAWESI SELATAN,> 8 Juta,< 10 Orang
2,3,Ira Furniture,112pengusaha@capstone.com,,89787890881,Aktif,DKI JAKARTA,KOTA ADM. JAKARTA TIMUR,Industri Pengolahan,DKI JAKARTA,> 8 Juta,< 10 Orang
3,4,Usaha Tahu Suparni,113pengusaha@capstone.com,,89787890882,Aktif,JAWA TIMUR,KAB. SIDOARJO,Industri Pengolahan,JAWA TIMUR,> 8 Juta,< 10 Orang
4,5,MARJAN BUEN,114pengusaha@capstone.com,,89787890883,Aktif,NUSA TENGGARA BARAT,KAB. SUMBAWA,Kegiatan Jasa Lainnya,NUSA TENGGARA BARAT,> 8 Juta,< 10 Orang
...,...,...,...,...,...,...,...,...,...,...,...,...
550,551,MEUBEL GARUDA,660pengusaha@capstone.com,,89787891429,Aktif,KALIMANTAN TENGAH,KAB. KOTAWARINGIN BARAT,Industri Pengolahan,KALIMANTAN TENGAH,> 8 Juta,< 10 Orang
551,552,Usaha Kue Frederika,661pengusaha@capstone.com,,89787891430,Aktif,MALUKU,KOTA AMBON,Industri Pengolahan,MALUKU,> 8 Juta,< 10 Orang
552,553,Dandang Royana,662pengusaha@capstone.com,,89787891431,Aktif,SUMATERA SELATAN,KOTA PRABUMULIH,Industri Pengolahan,SUMATERA SELATAN,4 - 8 Juta,< 10 Orang
553,554,Isnaniah,663pengusaha@capstone.com,,89787891432,Aktif,KALIMANTAN SELATAN,KAB. BANJAR,Industri Pengolahan,KALIMANTAN SELATAN,< 4 Juta,< 10 Orang


In [67]:
# Applying the clean_data function to features
for feature in features:
    df_train[feature] = df_train[feature].apply(clean_data)

In [68]:
df_train.head(5)

Unnamed: 0,id,name,email,alamat,no_telp,status,provinsi,kabkota,kategori,lokasi,range_fund,range_employee
0,1,Usaha Daur Ulang kertas Semen,110pengusaha@capstone.com,,89787890879,Aktif,SUMATERA UTARA,KAB. DELI SERDANG,industripengolahan,sumaterautara,>8juta,<10orang
1,2,Usaha Nyiru Dg Kanang,111pengusaha@capstone.com,,89787890880,Aktif,SULAWESI SELATAN,KAB. GOWA,industripengolahan,sulawesiselatan,>8juta,<10orang
2,3,Ira Furniture,112pengusaha@capstone.com,,89787890881,Aktif,DKI JAKARTA,KOTA ADM. JAKARTA TIMUR,industripengolahan,dkijakarta,>8juta,<10orang
3,4,Usaha Tahu Suparni,113pengusaha@capstone.com,,89787890882,Aktif,JAWA TIMUR,KAB. SIDOARJO,industripengolahan,jawatimur,>8juta,<10orang
4,5,MARJAN BUEN,114pengusaha@capstone.com,,89787890883,Aktif,NUSA TENGGARA BARAT,KAB. SUMBAWA,kegiatanjasalainnya,nusatenggarabarat,>8juta,<10orang


In [69]:
df_train['soup'] = df_train.apply(create_soup, axis=1)
df_train['soup']=df_train['soup'].apply(str)

In [70]:
df_train.tail(3)

Unnamed: 0,id,name,email,alamat,no_telp,status,provinsi,kabkota,kategori,lokasi,range_fund,range_employee,soup
552,553,Dandang Royana,662pengusaha@capstone.com,,89787891431,Aktif,SUMATERA SELATAN,KOTA PRABUMULIH,industripengolahan,sumateraselatan,4-8juta,<10orang,industripengolahan 4-8juta <10orang sumaterase...
553,554,Isnaniah,663pengusaha@capstone.com,,89787891432,Aktif,KALIMANTAN SELATAN,KAB. BANJAR,industripengolahan,kalimantanselatan,<4juta,<10orang,industripengolahan <4juta <10orang kalimantans...
554,557,Yestika Dian Wulandari,wulan@capstone.com,,89787891435,Aktif,KALIMANTAN TIMUR,KOTA SAMARINDA,"kesenian,hiburandanrekreasi",kalimantantimur,>8juta,10-30orang,"kesenian,hiburandanrekreasi >8juta 10-30orang ..."


# Calculating the similarity

In [71]:
#calculate the similarity

count = CountVectorizer()
count_matrix = count.fit_transform(df_train['soup'])
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [72]:
#Getting the index
indices = pd.Series(df_train.index, index=df_train['id'])

#Getting the 10 most matched with user

In [73]:
def get_recommendations(id, cosine_sim):
    # Get the index of the user_name
    idx = indices[id]

    # Get the pairwsie similarity scores of all enterpreneurs by the investor
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the enterpreneurs based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most match items with user
    sim_scores = sim_scores[1:11]

    # Get the item indices
    umkm_id = [i[0] for i in sim_scores]

    # Return the top 10 most similar items
    result_df = pengusaha_df[['id']].copy()
    return result_df.iloc[umkm_id, :]

In [74]:
result = get_recommendations(investor_df['id'].item(), cosine_sim2)
result.columns = ['id_recommended']
result.insert(0, 'id_user', investor_df['id'].item())

In [75]:
result

Unnamed: 0,id_user,id_recommended
11,557,12
41,557,42
44,557,45
49,557,50
155,557,156
250,557,251
37,557,38
60,557,61
71,557,72
77,557,78


# Export to JSON

In [76]:
json_file = result.to_json('test result.json',orient='records')