### Social Network for Advertisement

This dataset contains user data (gender, age and estimated salary) together with a flag indicating whether each user purchased or did not purchase a particular product.

In [28]:
# Library
import pandas as pd
import numpy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
# Collecting the data
# Read the advertisement CSV (path is relative to the working directory)
dataset = pd.read_csv(filepath_or_buffer='Social_Network_Ads.csv')
# Preview the first five rows
dataset.head(n=5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [3]:
# Summarize the frame: row count, column dtypes and non-null counts
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
User ID            400 non-null int64
Gender             400 non-null object
Age                400 non-null int64
EstimatedSalary    400 non-null int64
Purchased          400 non-null int64
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


### Preprocessing dataset

First, we remove the User ID column.
Next, we analyze the correlation between each remaining parameter and the Purchased column. Each correlation coefficient is compared against the critical value taken from an r table; this tells us which parameters are correlated with Purchased and which are not.

In [5]:
# Remove User ID
# `drop` returns a new frame, so `dataset` itself is left untouched
data = dataset.drop(columns=['User ID'])

In [6]:
# Change Gender Value
# Encode from the untouched `dataset` column rather than from `data['Gender']`:
# `Series.map` turns any value missing from the dict into NaN, so mapping the
# already-encoded 0/1 column a second time (e.g. on a cell re-run) would wipe it.
replace_dict= {'Female':0,'Male':1}
data['Gender']=dataset['Gender'].map(replace_dict)

In [7]:
# Preview the first five rows after encoding Gender
data.head(n=5)

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,1,19,19000,0
1,1,35,20000,0
2,0,26,43000,0
3,0,27,57000,0
4,1,19,76000,0


In [10]:
# Correlation Coeff Gender
# corrcoef returns the 2x2 correlation matrix; the off-diagonal entry is the
# coefficient between Gender and Purchased
gender_corr_matrix = numpy.corrcoef(data['Gender'], data['Purchased'])
G = gender_corr_matrix[0, 1]

In [11]:
# Correlation Coeff Age
# corrcoef returns the 2x2 correlation matrix; the off-diagonal entry is the
# coefficient between Age and Purchased
age_corr_matrix = numpy.corrcoef(data['Age'], data['Purchased'])
A = age_corr_matrix[0, 1]

In [12]:
# Correlation Coeff Estimated Salary
# corrcoef returns the 2x2 correlation matrix; the off-diagonal entry is the
# coefficient between EstimatedSalary and Purchased
salary_corr_matrix = numpy.corrcoef(data['EstimatedSalary'], data['Purchased'])
E = salary_corr_matrix[0, 1]

In [13]:
# Threshold that each correlation coefficient is compared against below.
# NOTE(review): presumably the critical r value read from an r table for this
# sample size (400 rows) — confirm the significance level and degrees of freedom used.
r_table = 0.098742

In [14]:
# A correlation counts as significant when its magnitude reaches the critical
# value, so compare |r|: a strong negative correlation is as valid as a positive one.
# The signed coefficient is still printed so the direction stays visible.
if abs(G) >= r_table:
    print("Gender Valid "+str(G))
else:
    print("Gender Not Valid "+str(G))

Gender Not Valid -0.04246945626450915


In [15]:
# A correlation counts as significant when its magnitude reaches the critical
# value, so compare |r|: a strong negative correlation is as valid as a positive one.
# The signed coefficient is still printed so the direction stays visible.
if abs(A) >= r_table:
    print("Age Valid "+str(A))
else:
    print("Age Not Valid "+str(A))

Age Valid 0.6224541988845291


In [16]:
# A correlation counts as significant when its magnitude reaches the critical
# value, so compare |r|: a strong negative correlation is as valid as a positive one.
# The signed coefficient is still printed so the direction stays visible.
if abs(E) >= r_table:
    print("Estimated Salary Valid "+str(E))
else:
    print("Estimated Salary Not Valid "+str(E))

Estimated Salary Valid 0.36208302580467916


In [18]:
# Remove Gender
# Gender did not reach the r-table threshold, so it is left out of the features
data = data.drop(columns=['Gender'])

In [19]:
# Preview the first five rows of the remaining columns
data.head(n=5)

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0


In [21]:
# Create Data Model
# Select features and target by column name instead of by position, so the
# split does not silently depend on column order or on which columns were
# dropped earlier in the notebook.
feature_cols = ['Age', 'EstimatedSalary']
target_col = 'Purchased'
data_model = data.values  # full array kept for any cell that still reads it
X = data[feature_cols].values
y = data[target_col].values

In [22]:
# Split data model - data train & data test
# test_size=0.25 is the library default, spelled out so the 300/100 split is explicit
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)
# Sanity check: sizes of the four resulting arrays
print(*(len(part) for part in (X_train, X_test, y_train, y_test)))

300 100 300 100


In [24]:
# Standard Scaler
# Fit the scaler on the training split only, then apply the same
# transformation to the test split (left-to-right evaluation fits first)
sc = StandardScaler()
X_train, X_test = sc.fit_transform(X_train), sc.transform(X_test)

In [25]:
# Peek at the first five scaled training rows
X_train[:5]

array([[ 0.58164944, -0.88670699],
       [-0.60673761,  1.46173768],
       [-0.01254409, -0.5677824 ],
       [-0.60673761,  1.89663484],
       [ 1.37390747, -1.40858358]])

In [27]:
# Train Logistic Regression
# Build the classifier, fit it on the scaled training split,
# then predict labels for the scaled test split
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)

In [31]:
# Result and Accuracy
# Per-class precision / recall / F1 on the held-out test split
report = classification_report(y_test, y_predict)
print(report)
# Overall accuracy, left as the last expression so the cell displays it
accuracy_score(y_true=y_test, y_pred=y_predict)

              precision    recall  f1-score   support

           0       0.89      0.96      0.92        68
           1       0.89      0.75      0.81        32

    accuracy                           0.89       100
   macro avg       0.89      0.85      0.87       100
weighted avg       0.89      0.89      0.89       100



0.89