In [None]:
!pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9675 sha256=ee3d728ea3bdecfff7e9b5d365ee6f70ca5109490ef3fafc3852b2c103770200
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import wget
import os

In [None]:
class Dataset():
  '''
  Wrapper around a pandas DataFrame that loads a named dataset and prepares
  the train/test features and targets used to fit a model.

  Attributes filled in by the methods below:
    df                 -- the full dataframe returned by load_dataset
    df_train, df_test  -- the two parts produced by train_test_split
    X_train, X_test    -- the feature columns (target column dropped)
    Y_train, Y_test    -- the target column
  '''

  def __init__(self, dataset_name):
    '''
    Load the dataset designated by `dataset_name` and reset the ML attributes.

    Raises ValueError when the name is not recognised.
    '''
    self.df = self.load_dataset(dataset_name)
    self.init_ml()

  def init_ml(self):
    '''
    Reset every train/test attribute to None.
    '''
    # they could be created lazily by the methods that fill them;
    # declaring them here keeps the list of attributes in one place
    self.df_train = None
    self.df_test = None
    self.X_train = None
    self.X_test = None
    self.Y_train = None
    self.Y_test = None

  def load_dataset(self,dataset_name):
    '''
    Load the dataset designated by its name and return it as a dataframe.

    The match is lenient on purpose (typos are tolerated): any name that
    starts with "i" or "I" selects the iris dataset.

    Raises ValueError for any other name, instead of silently returning None.
    '''
    if re.match(r"i", dataset_name, re.IGNORECASE):
      return self.load_iris()
    raise ValueError(f"unknown dataset name: {dataset_name!r} (only the iris dataset is supported)")

  def load_iris(self, iris_path = "/content/iris.data"):
    '''
    Return the iris dataset as a dataframe with named columns.

    iris_path -- location of the csv file; when it does not exist the file
                 is downloaded from the UCI repository to that location.
    '''
    if not os.path.exists(iris_path):
      # download to iris_path itself (not to the current directory),
      # otherwise the read below only works when both happen to coincide
      parent_dir = os.path.dirname(iris_path)
      if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
      wget.download(url ="https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", out=iris_path)
    # the file has no header row and uses the comma as separator
    df = pd.read_csv(iris_path, header=None, sep=',')
    df.columns =["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]
    return df

  def transform_data(self, col_name):
    '''
    Replace the column `col_name` of self.df (in place) by its integer category codes.
    '''
    self.df[col_name] = self.df[col_name].astype('category').cat.codes


  def train_test_split(self,train_size =0.8, random_state=None):
    '''
    Split self.df into the df_train and df_test dataframes.

    train_size   -- share (or number) of rows that go to df_train
    random_state -- optional seed that makes the shuffle reproducible
    '''
    # inside the method body the name resolves to the module-level
    # sklearn function, not to this method
    self.df_train, self.df_test = train_test_split(
      self.df, train_size=train_size, random_state=random_state)

  def prepare_for_ml(self, target_col_name="class"):
    '''
    Fill X_train, Y_train, X_test and Y_test from df_train and df_test.

    target_col_name -- column used as target; every other column is a feature.

    Raises AssertionError when train_test_split has not been run before.
    '''
    # check for None first so that a missing split gives the message below
    # rather than an AttributeError on None
    assert self.df_train is not None and self.df_train.shape[0] > 0, "df_train was not created, run train_test_split before"
    assert self.df_test is not None and self.df_test.shape[0] > 0, "df_test was not created, run train_test_split before"

    # features = every column but the target
    self.X_train = self.df_train.drop(target_col_name,axis=1)
    self.Y_train = self.df_train[target_col_name]
    self.X_test = self.df_test.drop(target_col_name, axis=1)
    self.Y_test = self.df_test[target_col_name]


  def __repr__(self):
    '''
    Short description of the dataset; safe to call before the split.
    '''
    df_train = getattr(self, "df_train", None)
    df_test = getattr(self, "df_test", None)
    if df_train is None or df_test is None:
      return "iris dataset, not split into train & test yet"
    return(f"iris dataset, {df_train.shape[0]} train observations & {df_test.shape[0]} test observations")

In [None]:
# build the Dataset wrapper: the constructor loads the iris csv into a dataframe
# (the file is downloaded when it is not found locally)
iris_dataset = Dataset(dataset_name = 'iris')

In [None]:
# replace the "class" column by its integer category codes (modifies iris_dataset.df in place)
iris_dataset.transform_data("class")

In [None]:
# fill df_train / df_test; this must run before prepare_for_ml
iris_dataset.train_test_split()

In [None]:
# separate features and target ("class" by default) for both the train and the test dataframes
iris_dataset.prepare_for_ml()

In [None]:
# training features: df_train without the target column
iris_dataset.X_train

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
20,5.4,3.4,1.7,0.2
124,6.7,3.3,5.7,2.1
102,7.1,3.0,5.9,2.1
80,5.5,2.4,3.8,1.1
127,6.1,3.0,4.9,1.8
...,...,...,...,...
65,6.7,3.1,4.4,1.4
128,6.4,2.8,5.6,2.1
70,5.9,3.2,4.8,1.8
88,5.6,3.0,4.1,1.3


In [None]:
# training target: the encoded "class" column of df_train
iris_dataset.Y_train

20     0
124    2
102    2
80     1
127    2
      ..
65     1
128    2
70     1
88     1
85     1
Name: class, Length: 112, dtype: int8

In [None]:
class ML_Model():

  def __init__(self,model_name):

    # pick the model based on the model_name
    self.load_model(model_name)
    self.score = 0
    self.trained = False

  def load_model(self, model_name, iterations= 1000):
    if re.match("(linear)\s{1,3}(re|ég)(ress?ion)?", model_name, re.IGNORECASE) :  
      self.model = self.load_lr(max_iterations = iterations)

  def load_lr(self, max_iterations):
    '''
    launch/instantiates a linear regression model with max_iterations
    '''

    lr = LogisticRegression(max_iter=max_iterations)
    return lr

  def fit(self,data):
    '''
    fit the model to the data and set an inner flag to True
    '''
    X = data.X_train
    y = data.Y_train
    self.model = self.model.fit(X, y)
    self.trained = True

  def predict_evaluate(self,data):
    '''
    predict and evaluate on the test set in a dataset
    '''
    X_test = data.X_test
    Y_test = data.Y_test
    predictions = self.model.predict(X_test)
    self.score = self.model.score(X_test, Y_test.values.reshape(-1,1))
    return self.score


  def __repr__(self):
    '''
    display our model via print()
    '''
    if self.trained:
      return(f"the model is of type :{self.model.__repr__()} and the score of the current trained model is : {self.score}")
    else:
      return self.model.__repr__()


In [None]:
# build an ML_Model from its name; the underlying estimator is a scikit-learn LogisticRegression
lr1 = ML_Model("linear reg")
repr(lr1.model)

'LogisticRegression(max_iter=1000)'

In [None]:
# show the size of the train/test split, then fit the model on the
# training part (X_train / Y_train) of the Dataset object
print(iris_dataset)
lr1.fit(iris_dataset)

iris dataset, 112 train observations & 38 test observations


In [None]:
# score the fitted model on the test part (X_test / Y_test); the value is also stored in lr1.score
lr1.predict_evaluate(iris_dataset)

0.9736842105263158

In [None]:
# once the model is trained, its text representation also includes the last score
print(lr1)

the model is of type :LogisticRegression(max_iter=1000) and the score of the current trained model is : 0.9736842105263158
