In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the outage workbook into a DataFrame; the file is looked up relative to
# the notebook's working directory.
data = pd.read_excel('mobile_network.xlsx')

In [3]:
# Count the missing (null) values in each column of the loaded sheet.
data.isnull().sum()

REGION           0
DESIGNATION      0
Unnamed: 2       0
Unnamed: 3      10
FAULT LEVEL      8
VALUE            8
Unnamed: 6      10
OUTAGE CAUSE     0
VALUE.1          0
dtype: int64

In [4]:
!pip install xplore
from xplore.data import xplore



In [5]:
# Print xplore's exploratory report for the raw sheet. The trailing semicolon
# keeps the returned object's repr out of the cell output.
xplore(data);

------------------------------------
The fist 5 entries of your dataset are:

          REGION DESIGNATION  Unnamed: 2  ...  Unnamed: 6      OUTAGE CAUSE  VALUE.1
0        ASHANTI           A           0  ...         NaN  CHARGING NETWORK        0
1    BRONG AHAFO           B           1  ...         NaN              CORE        1
2        CENTRAL           C           2  ...         NaN             FIBER        2
3        EASTERN           D           3  ...         NaN          HARDWARE        3
4  GREATER ACCRA           E           4  ...         NaN  HIGH TEMPERATURE        4

[5 rows x 9 columns]


------------------------------------
The last 5 entries of your dataset are:

       REGION DESIGNATION  Unnamed: 2  ...  Unnamed: 6         OUTAGE CAUSE  VALUE.1
5    NORTHERN           F           5  ...         NaN         IP TRANSPORT        5
6  UPPER EAST           G           6  ...         NaN         MW TRANSPORT        6
7  UPPER WEST           H           7  ...         NaN 

<xplore.data.xplore at 0x7f49463e3278>

In [6]:
# Build the working frame `df` from the loaded sheet.
df = pd.DataFrame(data=data)
# Bare last expression so the notebook renders the table.
df

Unnamed: 0,REGION,DESIGNATION,Unnamed: 2,Unnamed: 3,FAULT LEVEL,VALUE,Unnamed: 6,OUTAGE CAUSE,VALUE.1
0,ASHANTI,A,0,,L1,0.0,,CHARGING NETWORK,0
1,BRONG AHAFO,B,1,,L2,1.0,,CORE,1
2,CENTRAL,C,2,,,,,FIBER,2
3,EASTERN,D,3,,,,,HARDWARE,3
4,GREATER ACCRA,E,4,,,,,HIGH TEMPERATURE,4
5,NORTHERN,F,5,,,,,IP TRANSPORT,5
6,UPPER EAST,G,6,,,,,MW TRANSPORT,6
7,UPPER WEST,H,7,,,,,PLN WORK,7
8,VOLTA,I,8,,,,,POWER,8
9,WESTERN,J,9,,,,,UNDER INVESTIGATION,9


In [None]:
## Enter the feature columns you want to assign to the independent (X) and
## dependent (y) variables here.
feature_columns = ['']  # column names for X, e.g. ['REGION', 'OUTAGE CAUSE']
target_column = ''      # single column name for y, e.g. 'FAULT LEVEL'

# Fail early with a readable message while the placeholders are still unset or
# misspelled, rather than with pandas' generic KeyError.
unknown_columns = [
    col for col in [*feature_columns, target_column] if col not in df.columns
]
if unknown_columns:
    raise KeyError(
        f"Columns not found in df: {unknown_columns}. "
        f"Choose from: {list(df.columns)}"
    )

X = df[feature_columns]
# Select the target as a 1-D Series: the estimator and the metrics used further
# down expect a label vector, not a one-column frame.
y = df[target_column]

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split is reproducible; the same value is reused as the
# random forest's random_state.
seed = 50

# Hold out roughly one third of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.333,
    random_state=seed,
)

In [None]:
# Replace missing feature values with the literal string 'na' in both splits.
X_train, X_test = (split.fillna('na') for split in (X_train, X_test))

In [None]:
# Show each training feature's dtype; the object-typed columns are the ones
# picked up for one-hot encoding in the cells that follow.
X_train.dtypes

In [None]:
# Names of the object-dtype (non-numeric) feature columns; these are the columns
# handed to the one-hot encoder. Computed once — an equivalent spelling is
# X_train.columns[X_train.dtypes == object].tolist().
features_to_encode = list(X_train.select_dtypes(include=['object']).columns)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

# One-hot encode the categorical columns and pass every other column through
# unchanged. handle_unknown="ignore" encodes a category that was not present
# when the encoder was fitted as an all-zero row, so predicting on the test
# split does not raise when it contains a value the training split lacked.
col_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), features_to_encode),
    remainder="passthrough",
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Bagged forest of 150 trees with out-of-bag scoring, seeded for reproducibility.
# max_features='sqrt' is what the former 'auto' option meant for classifiers;
# 'auto' was deprecated and later removed from scikit-learn, so it now fails
# parameter validation at fit time.
# NOTE(review): min_samples_leaf=50 requires at least 50 training rows in every
# leaf, while the sheet displayed above has only 10 rows — with so little data
# no tree can split. Confirm this value against the real dataset size.
rf_classifier = RandomForestClassifier(
    min_samples_leaf=50,
    n_estimators=150,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=seed,
    max_features='sqrt',
)

In [None]:
from sklearn.pipeline import make_pipeline

# Chain the column transformer and the forest so the encoding is fitted on the
# training split only and re-applied automatically at predict time.
pipe = make_pipeline(col_trans, rf_classifier)

# X_train is passed as a DataFrame. No float cast is applied beforehand: it
# would raise on the string-valued columns (categories and the 'na' fill
# value), and the column transformer already encodes them.
pipe.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test features with the fitted pipeline.
y_pred = pipe.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score

In [None]:
# Fraction of test samples whose predicted label equals the true label.
# Computed once and formatted to one decimal place of a percent, which avoids
# float artefacts such as 66.70000000000002 from round(x, 3) * 100.
accuracy = accuracy_score(y_test, y_pred)
print(f"The accuracy of the model is {accuracy * 100:.1f} %")

In [None]:
# Hard label predictions on the training split (used for train-side metrics).
train_predictions = pipe.predict(X_train)

# Probability assigned to the second class (column index 1) for each split.
train_probs = pipe.predict_proba(X_train)[:, 1]
probs = pipe.predict_proba(X_test)[:, 1]

In [None]:
# Area under the ROC curve on the training split, then on the test split.
train_auc = roc_auc_score(y_train, train_probs)
print(f'Train ROC AUC Score: {train_auc}')
test_auc = roc_auc_score(y_test, probs)
print(f'Test ROC AUC  Score: {test_auc}')

In [None]:
def evaluate_model(y_pred, probs, train_predictions, train_probs):
    """Compare the model with an always-positive baseline and plot ROC curves.

    Ground truth is read from the notebook-level ``y_test`` and ``y_train``.
    Prints recall, precision and ROC AUC for the baseline, the test split and
    the training split, then draws the baseline and model ROC curves.

    Args:
        y_pred: Predicted labels for the test split.
        probs: Predicted scores for the test split, as passed to
            ``roc_auc_score`` / ``roc_curve``.
        train_predictions: Predicted labels for the training split.
        train_probs: Predicted scores for the training split.

    Returns:
        None. Output is printed and plotted.
    """
    # Baseline: predict the positive label (1) for every test sample.
    baseline_predictions = [1 for _ in range(len(y_test))]

    baseline = {
        'recall': recall_score(y_test, baseline_predictions),
        'precision': precision_score(y_test, baseline_predictions),
        'roc': 0.5,
    }
    results = {
        'recall': recall_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'roc': roc_auc_score(y_test, probs),
    }
    train_results = {
        'recall': recall_score(y_train, train_predictions),
        'precision': precision_score(y_train, train_predictions),
        'roc': roc_auc_score(y_train, train_probs),
    }

    # One summary line per metric. The message is built from adjacent f-strings
    # because a single-quoted f-string cannot span several source lines.
    for metric in ['recall', 'precision', 'roc']:
        print(f'{metric.capitalize()} '
              f'Baseline: {round(baseline[metric], 2)} '
              f'Test: {round(results[metric], 2)} '
              f'Train: {round(train_results[metric], 2)}')

    # Calculate false positive rates and true positive rates
    base_fpr, base_tpr, _ = roc_curve(y_test, baseline_predictions)
    model_fpr, model_tpr, _ = roc_curve(y_test, probs)

    plt.figure(figsize=(8, 6))
    plt.rcParams['font.size'] = 16

    # Plot both curves
    plt.plot(base_fpr, base_tpr, 'b', label='baseline')
    plt.plot(model_fpr, model_tpr, 'r', label='model')
    plt.legend()
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.show()


evaluate_model(y_pred, probs, train_predictions, train_probs)

In [None]:
# Importance scores of the fitted forest, followed by how many there are.
importances = rf_classifier.feature_importances_
print(importances)
print(f" There are {len(importances)} features in total")