## Data loading

### Subtask:
Load the telecom churn dataset.


In [None]:
import pandas as pd

# Read the telecom churn CSV from the working directory into the shared frame
df = pd.read_csv(filepath_or_buffer='telecom_churn.csv')

# Render the first five rows so the column layout can be inspected
display(df.head(n=5))

Unnamed: 0,customer_id,telecom_partner,gender,age,state,city,pincode,date_of_registration,num_dependents,estimated_salary,calls_made,sms_sent,data_used,churn
0,1,Reliance Jio,F,25,Karnataka,Kolkata,755597,2020-01-01,4.0,124962.0,44.0,45.0,-361.0,0.0
1,2,Reliance Jio,F,55,Mizoram,Mumbai,125926,2020-01-01,2.0,130556.0,62.0,39.0,5973.0,0.0
2,3,Vodafone,F,57,Arunachal Pradesh,Delhi,423976,2020-01-01,0.0,148828.0,49.0,24.0,193.0,1.0
3,4,BSNL,M,46,Tamil Nadu,Kolkata,522841,2020-01-01,1.0,38722.0,80.0,25.0,9377.0,1.0
4,5,BSNL,F,26,Tripura,Delhi,740247,2020-01-01,2.0,55098.0,78.0,15.0,1393.0,0.0


## Data exploration

### Subtask:
Explore the "pincode" and "num_dependents" attributes.


In [None]:
# Data types and missing-value counts for the two columns under review
explore_cols = ['pincode', 'num_dependents']
explore_view = df[explore_cols]
print(explore_view.dtypes)
print(explore_view.isnull().sum())

# Distinct pincodes, followed by how often each one occurs
pincode_col = df['pincode']
print(pincode_col.unique())
print(pincode_col.value_counts())

# Summary statistics for the dependents count
print(df['num_dependents'].describe())


pincode             int64
num_dependents    float64
dtype: object
pincode           0
num_dependents    1
dtype: int64
[755597 125926 423976 ... 245560 891528 517704]
pincode
304785    3
818851    3
957775    3
446566    3
202102    3
         ..
683189    1
916170    1
129866    1
749916    1
356397    1
Name: count, Length: 27028, dtype: int64
count    27436.000000
mean         1.991362
std          1.414947
min          0.000000
25%          1.000000
50%          2.000000
75%          3.000000
max          4.000000
Name: num_dependents, dtype: float64


**Reasoning**:
Analyze the potential issues in "pincode" and "num_dependents" based on the insights obtained in the previous step.



In [None]:
# Potential issues: each print shows the offending rows, so an empty frame
# means the check found nothing.

# pincode is held as an integer, so its format is assumed to be correct;
# the only check made here is for unrealistic negative values.
negative_pincode = df['pincode'] < 0
print(df.loc[negative_pincode])

# num_dependents must not be negative ...
negative_dependents = df['num_dependents'] < 0
print(df.loc[negative_dependents])

# ... and values above 10 are treated as implausibly large
excessive_dependents = df['num_dependents'] > 10
print(df.loc[excessive_dependents])

Empty DataFrame
Columns: [customer_id, telecom_partner, gender, age, state, city, pincode, date_of_registration, num_dependents, estimated_salary, calls_made, sms_sent, data_used, churn]
Index: []
Empty DataFrame
Columns: [customer_id, telecom_partner, gender, age, state, city, pincode, date_of_registration, num_dependents, estimated_salary, calls_made, sms_sent, data_used, churn]
Index: []
Empty DataFrame
Columns: [customer_id, telecom_partner, gender, age, state, city, pincode, date_of_registration, num_dependents, estimated_salary, calls_made, sms_sent, data_used, churn]
Index: []


## Data cleaning

### Subtask:
Handle missing values in the "pincode" and "num_dependents" columns.


In [None]:
# Missing-value counts for both columns before any imputation
impute_cols = ['pincode', 'num_dependents']
print(df[impute_cols].isnull().sum())

# pincode: when gaps exist, fill them with the most frequent value
if df['pincode'].isnull().any():
    pincode_mode = df['pincode'].mode()[0]
    df['pincode'] = df['pincode'].fillna(pincode_mode)

# num_dependents: when gaps exist, fill them with the median
if df['num_dependents'].isnull().any():
    dependents_median = df['num_dependents'].median()
    df['num_dependents'] = df['num_dependents'].fillna(dependents_median)

# Missing-value counts again, to confirm what is left after imputation
print(df[impute_cols].isnull().sum())

pincode           0
num_dependents    0
dtype: int64
pincode           0
num_dependents    0
dtype: int64


## Data wrangling

### Subtask:
Wrangle the "pincode" and "num_dependents" columns to ensure they are in a suitable format for analysis.


In [None]:
# Sentinel written wherever a value cannot be parsed or fails validation
INVALID_MARKER = -1

# --- Pincode formatting ---------------------------------------------------
# Work on the textual form so stray whitespace and symbols can be removed.
pincode_text = df['pincode'].astype(str).str.strip()
# A float-typed column (for example one that held NaN before imputation)
# renders 755597.0 as '755597.0'. Drop that trailing '.0' first; otherwise the
# digit filter below would turn it into the 7-digit '7555970' and every
# pincode would fail the range check.
pincode_text = pincode_text.str.replace(r'\.0+$', '', regex=True)
pincode_text = pincode_text.str.replace(r'\D', '', regex=True)  # keep digits only
pincode_num = pd.to_numeric(pincode_text, errors='coerce')  # empty strings become NaN
pincode_num = pincode_num.fillna(INVALID_MARKER).astype(int)
# Keep only six-digit codes (100000-999999); anything else becomes the sentinel
df['pincode'] = pincode_num.where(pincode_num.between(100000, 999999), INVALID_MARKER)

# --- num_dependents type conversion ---------------------------------------
dependents_num = pd.to_numeric(df['num_dependents'], errors='coerce')  # unparseable -> NaN
dependents_num = dependents_num.fillna(INVALID_MARKER).astype(int)
# Negative counts are not meaningful, so collapse them onto the sentinel
df['num_dependents'] = dependents_num.where(dependents_num >= 0, INVALID_MARKER)

# --- Verify changes -------------------------------------------------------
print(df[['pincode', 'num_dependents']].dtypes)
print(df[['pincode', 'num_dependents']].head())

pincode           int64
num_dependents    int64
dtype: object
   pincode  num_dependents
0   755597               4
1   125926               2
2   423976               0
3   522841               1
4   740247               2


In [None]:
 # Label each customer from the dependents count: exactly 0 gives 'Single',
 # every other value gives 'Married'.
 def _dependents_to_relation(count):
     if count == 0:
         return 'Single'
     return 'Married'

 relation = df['num_dependents'].apply(_dependents_to_relation)
 print(relation)

0        Married
1        Married
2         Single
3        Married
4        Married
          ...   
27432    Married
27433    Married
27434     Single
27435    Married
27436    Married
Name: num_dependents, Length: 27437, dtype: object


In [None]:
# Store the label as a column: 0 dependents -> 'Single', any other value -> 'Married'.
# Any other value includes a -1 placeholder, if one is present in the column.
no_dependents = df['num_dependents'].eq(0)
df['relation'] = no_dependents.map({True: 'Single', False: 'Married'})

# Number of customers in each group
relation_counts = df['relation'].value_counts()
print(relation_counts)

relation
Married    21891
Single      5546
Name: count, dtype: int64


In [None]:
  # Count customers for every (gender, num_dependents) pair and pivot the
  # dependents count into columns. The table is built a single time and kept
  # in a variable, so the groupby pipeline is not evaluated twice.
  gender_dependents = (
      df.groupby('gender')['num_dependents']
      .value_counts()
      .unstack()
      .fillna(0)  # pairs that never occur are shown as 0
  )
  print(gender_dependents)

num_dependents   0.0   1.0   2.0   3.0   4.0
gender                                      
F               2200  2273  2282  2215  2189
M               3346  3227  3198  3250  3256


In [None]:
# Count customers for every (relation, calls_made) pair
calls_by_relation = df.groupby('relation')['calls_made']
pair_counts = calls_by_relation.value_counts()

# Pivot calls_made into columns; pairs that never occur become 0
relation_table = pair_counts.unstack().fillna(0)
print(relation_table)

calls_made  -10.0   -9.0    -8.0    -7.0    -6.0    -5.0    -4.0    -3.0    \
relation                                                                     
Married         19      22      31      40      56      66      76      92   
Single           5       7       5      12      12      16      21      27   

calls_made  -2.0    -1.0     0.0     1.0     2.0     3.0     4.0     5.0    \
relation                                                                     
Married         91      97     116     134     152     152     188     184   
Single          33      30      43      30      30      50      42      49   

calls_made   6.0     7.0     8.0     9.0     10.0    11.0    12.0    13.0   \
relation                                                                     
Married        195     189     198     210     206     212     243     226   
Single          42      54      55      54      67      52      53      50   

calls_made   14.0    15.0    16.0    17.0    18.0    19.0    