### THE BUSINESS QUESTION THIS PROJECT INTENDS TO ANSWER: Given a bank customer, can we build a classifier that predicts whether they will leave within the next 6 months?

In [1]:
import pandas as pd

In [2]:
# Load the raw churn dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='Churn_Modelling.csv')

In [3]:
# Preview five randomly chosen rows of the raw dataset
df.sample(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
6405,6406,15760749,Vinogradov,509,Spain,Male,41,7,126683.8,1,0,1,114775.53,0
6781,6782,15714874,Major,850,France,Female,42,3,0.0,2,1,1,176883.42,0
9333,9334,15714680,Bianchi,755,France,Female,78,5,121206.96,1,1,1,76016.49,0
2164,2165,15794479,Becker,767,Spain,Male,77,8,149083.7,1,1,1,190146.83,0
4691,4692,15758050,Madukwe,622,Spain,Male,37,4,0.0,2,1,0,4459.5,0


In [4]:
# Remove the columns that are not needed for the analysis
df1 = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

In [5]:
# Preview five randomly chosen rows after dropping the unused columns
df1.sample(n=5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9905,645,Germany,Male,41,2,93925.3,1,1,0,123982.14,1
2306,697,France,Female,34,2,126558.92,1,1,0,73334.43,0
5615,520,France,Female,29,8,95947.76,1,1,0,4696.44,0
5228,751,France,Male,29,10,147737.63,1,0,1,94951.27,0
5242,655,Germany,Female,40,0,81954.6,1,1,1,198798.44,1


In [6]:
# Display the data type (dtype) of each column
df1.dtypes

CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [12]:
# function that displays the unique values of each column
def print_unique_cols(df1):
    """Print the distinct values of every column in a DataFrame.

    Writes one line per column to stdout in the form
    ``<column label> : <array of unique values>``. Returns nothing.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame whose columns are inspected; it is only read, never modified.
    """
    for label in df1:
        distinct_values = df1[label].unique()
        print(f'{label} : {distinct_values}')

In [13]:
# Print the distinct values of every column in df1, one line per column
print_unique_cols(df1)

CreditScore : [619 608 502 699 850 645 822 376 501 684 528 497 476 549 635 616 653 587
 726 732 636 510 669 846 577 756 571 574 411 591 533 553 520 722 475 490
 804 582 472 465 556 834 660 776 829 637 550 698 585 788 655 601 656 725
 511 614 742 687 555 603 751 581 735 661 675 738 813 657 604 519 664 678
 757 416 665 777 543 506 493 652 750 729 646 647 808 524 769 730 515 773
 814 710 413 623 670 622 785 605 479 685 538 562 721 628 668 828 674 625
 432 770 758 795 686 789 589 461 584 579 663 682 793 691 485 650 754 535
 716 539 706 586 631 717 800 683 704 615 667 484 480 578 512 606 597 778
 514 525 715 580 807 521 759 516 711 618 643 671 689 620 676 572 695 592
 567 694 547 594 673 610 767 763 712 703 662 659 523 772 545 634 739 771
 681 544 696 766 727 693 557 531 498 651 791 733 811 707 714 782 775 799
 602 744 588 747 583 627 731 629 438 642 806 474 559 429 680 749 734 644
 626 649 805 718 840 630 654 762 568 613 522 737 648 443 640 540 460 593
 801 611 802 745 483 690 492 709 705 

In [15]:
# Count the missing (null) values in each column
df1.isna().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [24]:
# Count blank strings per column. Each value is converted to text and stripped
# before the comparison, so the check catches '' as well as any whitespace-only
# value; comparing against ' ' alone would match only a single space.
df1.apply(lambda col: col.astype(str).str.strip().eq('').sum())

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [25]:
# Display the shape of the dataset as a (rows, columns) tuple
df1.shape

(10000, 11)

In [28]:
# Spot-check the Age value stored at row position 6697
df1['Age'].iat[6697]

41

In [34]:
# Encode Gender as an integer flag: Female -> 1, Male -> 0.
# The explicit cast pins the column to int64. Without it the resulting dtype depends
# on the pandas version, because `replace` on an object column relied on implicit
# downcasting that newer pandas releases deprecate. The cast also fails loudly if a
# value other than 'Female'/'Male' is left in the column. Re-running the cell is a no-op.
df1['Gender'] = df1['Gender'].replace({'Female':1,'Male':0}).astype('int64')

In [35]:
# Preview two random rows to confirm Gender is now encoded as 0/1
df1.sample(n=2)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9166,627,Germany,1,35,2,137852.96,1,1,1,172269.21,1
3100,810,Germany,0,35,3,96814.46,2,1,1,120511.03,0


In [45]:
# Count how many rows fall into each class of the target column `Exited`
df1['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [46]:
# Share of customers who stayed (Exited == 0). Computed from the data instead of
# hand-copied counts so the figure stays correct if the dataset changes.
float(df1['Exited'].eq(0).mean())

0.7963

In [47]:
# Share of customers who left (Exited == 1). Computed from the data instead of
# hand-copied counts so the figure stays correct if the dataset changes.
float(df1['Exited'].eq(1).mean())

0.2037

### As shown by the value counts, our dataset is imbalanced (about 80% stayed vs. 20% exited), but we won't try to fix it now

In [48]:
# Preview two random rows before one-hot encoding Geography
df1.sample(n=2)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
5393,835,France,0,32,8,124993.29,2,1,1,27548.06,0
9715,680,Spain,1,34,7,0.0,2,1,0,98949.85,0


In [49]:
# One-hot encode the Geography column into separate indicator columns
df2 = pd.get_dummies(df1, columns=['Geography'])

## 