In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report , accuracy_score
from sklearn.model_selection import cross_val_score
from joblib import dump , load
import json

In [2]:
# Load the raw dataset from disk; the cells below prune, clean, encode and
# model this `data` frame.
data = pd.read_csv('../data/top500_data.csv')

In [3]:
# Drop the explicitly listed columns that are not used as model features.
# A missing name raises KeyError, exactly as before.
data.drop(columns=[
    'latitude',
    'longitude',
    'zip_code',
    'id',
    'name',
    'labels',
    'object_id',
    'status',
    'milestones',
], inplace=True)

# Collect the remaining pattern-matched columns in one pass and drop them with
# a single call, instead of mutating the frame once per column inside the loop.
# Kept on purpose: `is_*` columns ending in `500` and `has_*` columns ending
# in `angel`.
unused_columns = [
    col for col in data.columns
    if (col.startswith('is_') and not col.endswith('500'))
    or (col.startswith('has_') and not col.endswith('angel'))
    or col.startswith(('age_', 'state_code', 'Unnamed'))
    or col.endswith('_at')
]
data.drop(columns=unused_columns, inplace=True)

In [4]:
# Map each remaining source column name to the French label used from here on
# (features, saved artefacts and the target column `top500`).
french_labels = {
    'city': 'Ville',
    'relationships': 'Nombre_relations',
    'funding_rounds': 'Tours_financement',
    'funding_total_usd': 'Capitale_fonds',
    'category_code': 'Categorie',
    'has_angel': 'Investisseurs_providentiels',
    'avg_participants': 'Nombre_participants',
    'is_top500': 'top500',
}
data.rename(columns=french_labels, inplace=True)

In [5]:
# Sanity check: (rows, columns) left after pruning and renaming.
data.shape

(923, 8)

In [6]:
# Remove exact duplicate rows (keeping the first occurrence), then any row that
# still has a missing value. Both steps mutate `data` in place.
data.drop_duplicates(keep='first', inplace=True)
data.dropna(axis=0, how='any', inplace=True)

# Show the resulting (rows, columns) to see how many rows were removed.
data.shape

(922, 8)

In [7]:
# Load the city-name substitutions. JSON text is UTF-8, so the encoding is
# pinned rather than left to the platform default, which can mis-decode
# non-ASCII city names.
with open('../data/moroccan_cities.json', 'r', encoding='utf-8') as f:
    city_replacement = json.load(f)

# Swap every city value found in the mapping for its replacement.
data['Ville'] = data['Ville'].replace(city_replacement)

# Drop the rows whose city is still one of these two misspelled values
# (presumably they are absent from the mapping -- confirm against the JSON).
unmapped_cities = ['San Franciso', 'El Segundo,']
data.drop(data[data['Ville'].isin(unmapped_cities)].index, inplace=True)

data.head()

Unnamed: 0,Ville,Nombre_relations,Tours_financement,Capitale_fonds,Categorie,Investisseurs_providentiels,Nombre_participants,top500
0,Casablanca,3,3,375000,music,1,1.0,0
1,Rabat,9,4,40100000,enterprise,0,4.75,1
2,Casablanca,5,1,2600000,web,0,4.0,1
3,Marrakech,5,3,40000000,software,0,3.3333,1
4,Fez,2,2,1300000,games_video,1,1.0,1


In [8]:
# Capture the distinct raw values of both text columns while they are still
# strings (they are label-encoded further down), then persist each array.
ville_valeurs = data['Ville'].unique()
categorie_valeurs = data['Categorie'].unique()

dump(ville_valeurs, './top500/ville_valeurs.pkl')
# Left as the last expression so the cell still displays the written file name.
dump(categorie_valeurs, './top500/categorie_valeurs.pkl')

['./top500/categorie_valeurs.pkl']

In [9]:

# One LabelEncoder per text column. Each encoder is fitted on the column, the
# column is overwritten with its integer codes, and the fitted encoder is
# saved so the same string -> code mapping can be reapplied later.
categorical_columns = ('Ville', 'Categorie')
encoders = {column: LabelEncoder() for column in categorical_columns}

for column in categorical_columns:
    fitted = encoders[column]
    data[column] = fitted.fit_transform(data[column])
    dump(fitted, f'./top500/{column}.pkl')

print("Encoders saved successfully!")
data.head()

Encoders saved successfully!


Unnamed: 0,Ville,Nombre_relations,Tours_financement,Capitale_fonds,Categorie,Investisseurs_providentiels,Nombre_participants,top500
0,8,3,3,375000,19,1,1.0,0
1,32,9,4,40100000,8,0,4.75,1
2,8,5,1,2600000,34,0,4.0,1
3,23,5,3,40000000,30,0,3.3333,1
4,14,2,2,1300000,11,1,1.0,1


In [10]:
# Separate the label column from the feature matrix (all other columns).
y = data['top500']
X = data.drop(columns=['top500'])

In [11]:
# Persist the feature column index (names and order) of the training matrix.
dump(X.columns, './top500/top500_columns.pkl')

['./top500/top500_columns.pkl']

In [12]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Fit a 3-nearest-neighbours classifier on the training split.
# NOTE(review): no scaling step is applied to the features anywhere in this
# notebook, and distance-based models are sensitive to feature magnitude --
# confirm whether large-valued columns such as Capitale_fonds dominate the
# neighbour search.
model=KNeighborsClassifier(n_neighbors=3)
model.fit(X_train,y_train)

In [14]:
# Predict on the held-out split and report accuracy plus the per-class metrics.
y_pred = model.predict(X_test)
print("KNeighbors Classifier")

holdout_accuracy = accuracy_score(y_test, y_pred)
print(holdout_accuracy)

holdout_report = classification_report(y_test, y_pred)
print(holdout_report)

# Mean score over 5 cross-validation folds on the full feature matrix.
fold_scores = cross_val_score(model, X, y, cv=5)
print(fold_scores.mean())

KNeighbors Classifier
0.8097826086956522
              precision    recall  f1-score   support

           0       0.44      0.34      0.39        32
           1       0.87      0.91      0.89       152

    accuracy                           0.81       184
   macro avg       0.65      0.63      0.64       184
weighted avg       0.79      0.81      0.80       184

0.7989130434782609


In [15]:
# Persist the fitted classifier next to the encoders and column list.
dump(model, './top500/top500_model.pkl')
print("Model saved")

Model saved


*Testing the saved model*

In [16]:
# Reload the persisted classifier from disk instead of reusing the in-memory one.
model = load('./top500/top500_model.pkl')
print("Model loaded successfully")

# Two hand-written rows in raw (un-encoded) form. They get their own `samples`
# frame so the training frame `data` is not overwritten, which keeps the
# earlier cells safe to re-run.
samples = pd.DataFrame([
    {
        'Ville': 'Casablanca',
        'Nombre_relations': 3,
        'Tours_financement': 3,
        'Capitale_fonds': 375000,
        'Categorie': 'music',
        'Investisseurs_providentiels': 1,
        'Nombre_participants': 1.0000,
    },
    {
        'Ville': 'Rabat',
        'Nombre_relations': 9,
        'Tours_financement': 4,
        'Capitale_fonds': 40100000,
        'Categorie': 'enterprise',
        'Investisseurs_providentiels': 0,
        'Nombre_participants': 4.7500,
    }
])

columns_to_encode = ['Ville', 'Categorie']

# Apply the saved encoders so the text values get the same integer codes as in
# training. `transform` raises ValueError for a value the encoder never saw.
for col in columns_to_encode:
    encoder = load(f"./top500/{col}.pkl")
    samples[col] = encoder.transform(samples[col])

# Put the features in the exact column order saved at training time, so the
# prediction does not depend on the key order of the dicts above.
# A missing column raises KeyError here.
feature_columns = list(load('./top500/top500_columns.pkl'))
samples = samples[feature_columns]

predictions = model.predict(samples)
print(predictions)

Model loaded successfully
[0 1]
