In [4]:
import pandas as pd

In [5]:
# Load the transport dataset from a CSV file located relative to the
# notebook's current working directory.
data = pd.read_csv(filepath_or_buffer='transport2.csv')

In [6]:
# Display the column labels of the loaded frame.
data.columns

Index(['num', 'age', 'sexe', 'sitfam', 'Principal means', 'Car', 'Bike',
       'Public transport', 'revenu'],
      dtype='object')

In [7]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [9]:
# Replace the 'sexe' column with integer codes, one per distinct value.
# The fitted encoder stays bound to `encoder` after this cell.
# NOTE(review): scikit-learn documents LabelEncoder for target labels; it is
# used on a feature column here - confirm this is intended.
encoder = LabelEncoder()
encoder.fit(data["sexe"])
data["sexe"] = encoder.transform(data["sexe"])

In [10]:
# Fill missing values in each numeric column with that column's median.
# numeric_only=True restricts the median to numeric columns, so a text column
# no longer raises when the median is computed; such columns are left as-is.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split further down.
data = data.fillna(data.median(numeric_only=True))

In [11]:
# Feature matrix: every column except the target ('Public transport') and the
# 'num' and 'Principal means' columns.
# NOTE(review): presumably 'num' is a row identifier - confirm.
X = data.drop(columns=['Public transport', 'num', 'Principal means'])

In [12]:
# Target vector: the 'Public transport' column.
y = data.loc[:, 'Public transport']

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split - and therefore every score computed from it - reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [15]:
# Partition the feature columns by dtype: object-dtype columns are routed to
# the categorical branch, every other dtype to the numeric branch.
categorical_columns = X_train.select_dtypes(include=['object']).columns
numeric_columns = X_train.select_dtypes(exclude=['object']).columns

In [16]:
# Numeric branch: impute missing values with the column median, then rescale
# each feature with MinMaxScaler.
numeric_features = Pipeline([
    ('handlingmissingvalues', SimpleImputer(strategy='median')),
    ('scaling', MinMaxScaler())
])

# Categorical branch: impute with the most frequent value, then one-hot encode.
# - No scaler follows the encoder: one-hot output is already 0/1, and
#   MinMaxScaler rejects the sparse matrix OneHotEncoder returns by default.
# - handle_unknown='ignore' encodes a category not seen during fit as all
#   zeros instead of raising at prediction time.
categorical_features = Pipeline([
    ('handlingmissingvalues', SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns through its own branch.
processing = ColumnTransformer([
    ('numeric', numeric_features, numeric_columns),
    ('categorical', categorical_features, categorical_columns)
])

In [17]:
def prepare_model(algorithm):
    """Build and fit a preprocessing + estimator pipeline on the training split.

    Parameters
    ----------
    algorithm : estimator
        scikit-learn estimator used as the final 'modeling' step. It is fitted
        in place as part of the pipeline.

    Returns
    -------
    Pipeline
        Pipeline with steps 'processing' and 'modeling', fitted on the
        notebook-level ``X_train`` / ``y_train``.
    """
    # Imported locally so this cell does not depend on an import cell elsewhere.
    from sklearn.base import clone

    model = Pipeline(steps=[
        # Clone the shared ColumnTransformer so each returned model owns its
        # preprocessing. Without the clone, every call refits the single
        # module-level `processing` object that earlier models also reference.
        ('processing', clone(processing)),
        ('modeling', algorithm)
    ])
    model.fit(X_train, y_train)
    return model

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

In [19]:
# Fit the AdaBoost pipeline. The fixed random_state makes the fitted ensemble
# reproducible from one run of the notebook to the next.
my_model = prepare_model(AdaBoostClassifier(random_state=42))

In [20]:
# Evaluate the fitted pipeline on the held-out split using the estimator's
# default score.
my_model.score(X=X_test, y=y_test)

0.6

In [21]:
import joblib
# Persist the whole fitted pipeline (preprocessing + classifier) to disk.
joblib.dump(value=my_model, filename='model_scaler.pkl')

['model_scaler.pkl']

In [22]:
import numpy as np

In [23]:
# Smoke-test the fitted pipeline on a single hand-written row.
# The frame is named `sample_input` rather than `input` so that the built-in
# input() is not shadowed for the rest of the kernel session.
sample_input = pd.DataFrame(
    np.array([[45, 0, 3, 1, 1, 1500.0]]),
    columns=['age', 'sexe', 'sitfam', 'Car', 'Bike', 'revenu'],
)
my_model.predict(sample_input)

array([0])

In [24]:
def return_preds(age, sex, num_fam_members, car, bike, revenue):
    """Run the fitted ``my_model`` pipeline on a single observation.

    The six arguments are packed, in order, into a one-row DataFrame whose
    columns are 'age', 'sexe', 'sitfam', 'Car', 'Bike' and 'revenu', and that
    frame is passed to ``my_model.predict``.

    Returns
    -------
    The value returned by ``my_model.predict`` for that single row.
    """
    feature_names = ['age', 'sexe', 'sitfam', 'Car', 'Bike', 'revenu']
    values = np.array([age, sex, num_fam_members, car, bike, revenue])
    # reshape(1, -1) turns the flat vector into a single row of six features.
    single_row = pd.DataFrame(values.reshape(1, -1), columns=feature_names)
    return my_model.predict(single_row)

In [25]:
return_preds(35, 0, 10, 1, 0, 150)

array([0])