# Imports:

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from diabetes_prediction.diabetes_prediction_lib import DataLoader, NaNRemover, NaNFiller, EthnicityOneHotEncoder, GenderBinaryTransformer, Model


# Loading and Splitting Data:

In [2]:
# Location of the sample dataset (relative path), kept in one named constant
# so it is easy to spot and change.
DATA_PATH = 'data/sample_diabetes_mellitus_data.csv'

# Build the project's loader for that file; nothing is split until
# load_and_split() is called in the next cell.
loader = DataLoader(DATA_PATH)

In [3]:
# Ask the loader for its two partitions and unpack them as the training and
# test frames used by every later cell.
# NOTE(review): the split ratio and whether the split is seeded are decided
# inside DataLoader and are not visible here -- confirm the split is
# reproducible across kernel restarts.
train_df, test_df = loader.load_and_split()

# Pre-processing:

In [4]:
# Cleaning steps provided by the project library.
Remover = NaNRemover()
Filler = NaNFiller()

# The warnings recorded for this cell show fill_nan executing
# `df[column].fillna(df[column].mean(), inplace=True)` on the frame it is
# given, and pandas flagging that frame as "a copy of a slice".
# Passing an explicit copy of remove_nan's result means the in-place write
# lands on an independent frame: it avoids the SettingWithCopyWarning and can
# never leak back into train_df / test_df.
#
# NOTE(review): the pandas 3.0 FutureWarning about chained in-place fillna
# comes from inside NaNFiller and has to be fixed there
# (e.g. `df[col] = df[col].fillna(...)`); it cannot be fixed from this cell.
# NOTE(review): per that same line, each frame is filled with its *own*
# column means, so the test set is imputed with test statistics rather than
# the training ones -- confirm this is intended.
train_df_cleaned = Filler.fill_nan(Remover.remove_nan(train_df).copy())
test_df_cleaned = Filler.fill_nan(Remover.remove_nan(test_df).copy())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column].fillna(df[column].mean(), inplace=True)


# Feature Creation:

In [5]:
# Feature-engineering steps provided by the project library.
gender_transformer = GenderBinaryTransformer()
ethnicity_encoder = EthnicityOneHotEncoder()

# The warning recorded for this cell shows the gender step running
# `df['gender'] = df['gender'].map({'M': 1, 'F': 0})` on the frame it receives,
# i.e. it appears to write into its input. Fed the cleaned frames directly,
# that would overwrite their 'gender' column, and re-running this cell would
# then map the already-encoded 1/0 values to NaN (Series.map yields NaN for
# keys missing from the dict).
# Transforming copies keeps *_df_cleaned untouched, makes the cell safe to
# re-run, and avoids the SettingWithCopyWarning.
train_df_transformed = ethnicity_encoder.transform(
    gender_transformer.transform(train_df_cleaned.copy())
)
test_df_transformed = ethnicity_encoder.transform(
    gender_transformer.transform(test_df_cleaned.copy())
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['gender'] = df['gender'].map({'M': 1, 'F': 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['gender'] = df['gender'].map({'M': 1, 'F': 0})


# Model Training:

In [6]:
# Wrap a default scikit-learn LogisticRegression in the project's Model
# helper, telling it which columns are inputs and which one is the label.
lg_model = Model(
    feature_columns=['age', 'gender', 'height', 'weight'],
    target_column='diabetes_mellitus',
    model=LogisticRegression(),
)

# Fit on the transformed training frame.
lg_model.train(train_df_transformed)

# Predictions:

In [7]:
# Score the held-out frame. predict() hands back two objects:
#   lg_predictions -- the predicted class per row (displayed below as a
#                     Series named 'Predictions_DB'),
#   lg_probs       -- the per-class probabilities (displayed below with
#                     columns 'Prob_no_DM' and 'Prob_has_DM').
lg_predictions, lg_probs = lg_model.predict(test_df_transformed)

In [8]:
# Show the predicted classes; pandas truncates the display to the first and
# last few rows.
lg_predictions

6252    0
1731    0
4742    0
4521    0
6340    0
       ..
6412    0
8285    0
7853    0
1095    0
6929    0
Name: Predictions_DB, Length: 1874, dtype: int64

In [9]:

# Human-readable names for the two predicted class codes.
label_names = {0: 'Negative', 1: 'Positive'}

# Swap the codes for their names, then report how many rows fall in each class.
lg_predictions_replaced = lg_predictions.replace(label_names)
label_counts = lg_predictions_replaced.value_counts()
print(label_counts)

Predictions_DB
Negative    1824
Positive      50
Name: count, dtype: int64


In [10]:
# Show the per-row class probabilities returned by predict().
lg_probs

Unnamed: 0,Prob_no_DM,Prob_has_DM
6252,0.678063,0.321937
1731,0.720801,0.279199
4742,0.878997,0.121003
4521,0.878299,0.121701
6340,0.633456,0.366544
...,...,...
6412,0.834030,0.165970
8285,0.813243,0.186757
7853,0.704653,0.295347
1095,0.744155,0.255845
