# 1. Import you data and perform basic data exploration phase.

In [1]:
import pandas as pd

# Specify the columns to load
columns_to_load = ["REGION", "TENURE", "MONTANT", "FREQUENCE_RECH", "REVENUE", "ARPU_SEGMENT", 
                   "FREQUENCE", "DATA_VOLUME", "ON_NET", "ORANGE", "TIGO", "ZONE1", 
                   "ZONE2", "MRG", "REGULARITY", "TOP_PACK", "FREQ_TOP_PACK", "CHURN"]

# Load data with Pandas
df = pd.read_csv("Expresso_churn_dataset.csv", usecols=columns_to_load)

# Drop the 'user_id' column if it's included in the data
if "user_id" in df.columns:
    df.drop(columns=["user_id"], inplace=True)

# Save the cleaned dataframe to 'data' variable if needed for further processing
data = df

# Display the cleaned dataframe
data

Unnamed: 0,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,MRG,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN
0,FATICK,K > 24 month,4250.0,15.0,4251.0,1417.0,17.0,4.0,388.0,46.0,1.0,1.0,2.0,NO,54,On net 200F=Unlimited _call24H,8.0,0
1,,I 18-21 month,,,,,,,,,,,,NO,4,,,1
2,,K > 24 month,3600.0,2.0,1020.0,340.0,2.0,,90.0,46.0,7.0,,,NO,17,On-net 1000F=10MilF;10d,1.0,0
3,DAKAR,K > 24 month,13500.0,15.0,13502.0,4501.0,18.0,43804.0,41.0,102.0,2.0,,,NO,62,"Data:1000F=5GB,7d",11.0,0
4,DAKAR,K > 24 month,1000.0,1.0,985.0,328.0,1.0,,39.0,24.0,,,,NO,11,Mixt 250F=Unlimited_call24H,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2154043,,K > 24 month,,,,,,,,,,,,NO,6,,,0
2154044,THIES,K > 24 month,6100.0,15.0,5800.0,1933.0,15.0,621.0,26.0,40.0,40.0,,,NO,55,"Data: 200 F=100MB,24H",9.0,0
2154045,,K > 24 month,,,,,,,,,,,,NO,1,,,1
2154046,THIES,K > 24 month,10000.0,11.0,7120.0,2373.0,13.0,,0.0,140.0,13.0,,,NO,28,All-net 500F=2000F;5d,12.0,0


### - Display general information about the dataset.

In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2154048 entries, 0 to 2154047
Data columns (total 18 columns):
 #   Column          Dtype  
---  ------          -----  
 0   REGION          object 
 1   TENURE          object 
 2   MONTANT         float64
 3   FREQUENCE_RECH  float64
 4   REVENUE         float64
 5   ARPU_SEGMENT    float64
 6   FREQUENCE       float64
 7   DATA_VOLUME     float64
 8   ON_NET          float64
 9   ORANGE          float64
 10  TIGO            float64
 11  ZONE1           float64
 12  ZONE2           float64
 13  MRG             object 
 14  REGULARITY      int64  
 15  TOP_PACK        object 
 16  FREQ_TOP_PACK   float64
 17  CHURN           int64  
dtypes: float64(12), int64(2), object(4)
memory usage: 295.8+ MB


### - Handle Missing and corrupted values.

In [3]:
# Check for missing values
print(data.isna().sum())

# Separate numerical and categorical columns
numerical_cols = data.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = data.select_dtypes(include=["object", "category"]).columns

stats = data.describe()

# Fill missing values for numerical columns with the mean
for col in numerical_cols:
    mean = stats[col].values[1]
    print(f"The mean of {col} is {mean}")
    data[col] = data[col].fillna(mean)

# Calculate the mode for each categorical column
modes = {}
for col in categorical_cols:
    modes[col] = data[col].mode()[0]

# Fill the missing values in each categorical column using its mode
for col in categorical_cols:
    data[col].fillna(modes[col], inplace=True)

# Verify that there are no remaining missing values
print(data.isna().sum())

REGION             849299
TENURE                  0
MONTANT            756739
FREQUENCE_RECH     756739
REVENUE            726048
ARPU_SEGMENT       726048
FREQUENCE          726048
DATA_VOLUME       1060433
ON_NET             786675
ORANGE             895248
TIGO              1290016
ZONE1             1984327
ZONE2             2017224
MRG                     0
REGULARITY              0
TOP_PACK           902594
FREQ_TOP_PACK      902594
CHURN                   0
dtype: int64
The mean of MONTANT is 5532.11699774352
The mean of FREQUENCE_RECH is 11.529119901181485
The mean of REVENUE is 5510.810334033614
The mean of ARPU_SEGMENT is 1836.942893557423
The mean of FREQUENCE is 13.978141456582634
The mean of DATA_VOLUME is 3366.4501666491406
The mean of ON_NET is 277.6891404174282
The mean of ORANGE is 95.41871067683508
The mean of TIGO is 23.109252898040815
The mean of ZONE1 is 8.170132158071187
The mean of ZONE2 is 7.553309360930831
The mean of REGULARITY is 28.0425050880946
The mean of F

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(modes[col], inplace=True)


REGION            0
TENURE            0
MONTANT           0
FREQUENCE_RECH    0
REVENUE           0
ARPU_SEGMENT      0
FREQUENCE         0
DATA_VOLUME       0
ON_NET            0
ORANGE            0
TIGO              0
ZONE1             0
ZONE2             0
MRG               0
REGULARITY        0
TOP_PACK          0
FREQ_TOP_PACK     0
CHURN             0
dtype: int64


### - Remove duplicates, if they exist.

In [4]:
# Check for duplicates
duplicates = data.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

Number of duplicate rows: 650544


In [5]:
data = data.drop_duplicates()

In [6]:
duplicates_after = data.duplicated().sum()
print(f"Number of duplicate rows after removal: {duplicates_after}")

Number of duplicate rows after removal: 0


### - Handle outliers, if they exist.
    No outliers present.

### - Encode categorical features.

In [7]:
categorical_cols

Index(['REGION', 'TENURE', 'MRG', 'TOP_PACK'], dtype='object')

In [8]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder() #instantiate the label encoder class

for col in categorical_cols: 
    print(f"Encoding {col}") 
    data.loc[:, col] = encoder.fit_transform(data[col])

Encoding REGION
Encoding TENURE
Encoding MRG
Encoding TOP_PACK


# 2. Based on the previous data exploration train and test a machine learning classifier

In [9]:
# Assume df is your DataFrame
sampled_data = data.sample(n=100000)  # Sample 100000 rows

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Split data into train and test sets
features = sampled_data.drop("CHURN", axis=1)
label = sampled_data["CHURN"]

x_train, x_test, y_train, y_test = train_test_split(features, label, test_size=0.2)

# Train a classifier
clf = RandomForestClassifier()
clf.fit(x_train, y_train)

In [11]:
# Train the model on the training set
clf = RandomForestClassifier()
clf.fit(x_train, y_train)

# Evaluate the model on the validation set
y_pred = clf.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")

Model Accuracy: 0.9421


In [12]:
import pickle

# Save the model to a file
with open("model.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)