# Internet usage prediction based on demographic data

### Imports

In [68]:
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt

In [69]:
# Read the workbook straight from GitHub. sheet_name=1 selects the second
# worksheet (zero-based) and index_col=0 uses the first column as the row index.
df_internet = pd.read_excel(
    io='https://github.com/jsulopzs/data/blob/main/internet_usage_spain.xlsx?raw=true',
    sheet_name=1,
    index_col=0,
)
df_internet.head(10)

Unnamed: 0_level_0,internet_usage,sex,age,education
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Josefina,0,Female,66,Elementary
Vicki,1,Male,72,Elementary
David,1,Male,48,University
Curtis,0,Male,59,PhD
Josephine,1,Female,44,PhD
Malinda,1,Female,34,PhD
Fern,1,Female,49,PhD
Sue,1,Male,43,University
Juanita,1,Male,23,Elementary
Robert,1,Male,29,Higher Level


In [70]:
# (number of rows, number of columns) of the loaded frame
df_internet.shape

(2455, 4)

## Data Preprocessing

In [71]:
# Count missing values per column
df_internet.isnull().sum()

internet_usage    0
sex               0
age               0
education         0
dtype: int64

In [72]:
# One-hot encode the categorical columns (sex, education). drop_first=True
# drops one level per column so the dummies are not perfectly collinear.
# dtype=int keeps the dummies as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise produce True/False columns).
df_internet = pd.get_dummies(data=df_internet, drop_first=True, dtype=int)

In [73]:
# Preview the frame after one-hot encoding
df_internet.head(10)

Unnamed: 0_level_0,internet_usage,age,sex_Male,education_High School,education_Higher Level,education_No studies,education_PhD,education_University
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Josefina,0,66,0,0,0,0,0,0
Vicki,1,72,1,0,0,0,0,0
David,1,48,1,0,0,0,0,1
Curtis,0,59,1,0,0,0,1,0
Josephine,1,44,0,0,0,0,1,0
Malinda,1,34,0,0,0,0,1,0
Fern,1,49,0,0,0,0,1,0
Sue,1,43,1,0,0,0,0,1
Juanita,1,23,1,0,0,0,0,0
Robert,1,29,1,0,1,0,0,0


In [74]:
# Separate the label column from the explanatory variables.
target = df_internet["internet_usage"]
features = df_internet.drop(columns=["internet_usage"])

# Models 

In [75]:
# Imports 
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [76]:
# Seed the estimators that use randomness so that a Restart & Run All
# reproduces the same scores.
model_dt = DecisionTreeClassifier(random_state=42)
# probability=True enables predict_proba; the internal calibration it runs
# is seeded by random_state.
model_sv = SVC(probability=True, random_state=42)
model_lr = LogisticRegression(max_iter=1000)

In [77]:
def calculate_score(model):
    """Fit ``model`` on the full dataset and score it on that same data.

    Uses the notebook-level ``features`` and ``target``. Because the model is
    evaluated on the very rows it was trained on, the result is an in-sample
    score.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted
        in place.

    Returns
    -------
    The value returned by ``model.score`` (for the scikit-learn classifiers
    used in this notebook, the mean accuracy).
    """
    model.fit(X=features, y=target)
    return model.score(X=features, y=target)

In [78]:
# In-sample score of each model, keyed by a readable model name.
dict_score = {
    "Decision Tree": calculate_score(model_dt),
    "SVC": calculate_score(model_sv),
    "Logistic Regression": calculate_score(model_lr),
}

In [79]:
# Rank the models from best to worst in-sample score.
model_accuracy = (
    pd.Series(dict_score)
    .sort_values(ascending=False)
)
model_accuracy

Decision Tree          0.859878
Logistic Regression    0.833401
SVC                    0.783707
dtype: float64

The decision tree is the best model, with a score of about 86%.
However, we calculated each model's accuracy on the same data that was used to train it, so these scores are optimistic.
To detect overfitting we need to evaluate the models on data they have not seen during training.
So we will split the data into 70% for training and the remaining 30% for testing.

### Train and Test data 

In [102]:
from sklearn.model_selection import train_test_split


In [103]:
# Hold out 30% of the rows for evaluation; random_state fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.30,
    random_state=42,
)

In [104]:
def calculate_score_test(model):
    """Fit ``model`` on the training split and score it on the test split.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted
        in place, replacing any previous fit.

    Returns
    -------
    The value returned by ``model.score`` on the held-out data (for the
    scikit-learn classifiers used in this notebook, the mean accuracy).
    """
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)


In [107]:
# Held-out score of each model, keyed by a readable model name.
dict_score_test = {
    "Decision Tree": calculate_score_test(model_dt),
    "SVC": calculate_score_test(model_sv),
    "Logistic Regression": calculate_score_test(model_lr),
}

In [108]:
# Test-set score per model, in insertion order (not yet sorted)
dict_score_test

{'Decision Tree': 0.8046132971506106,
 'SVC': 0.7788331071913162,
 'Logistic Regression': 0.8548168249660787}

In [110]:
# Rank the models from best to worst test-set score.
model_accuracy_test = (
    pd.Series(dict_score_test)
    .sort_values(ascending=False)
)
model_accuracy_test

Logistic Regression    0.854817
Decision Tree          0.804613
SVC                    0.778833
dtype: float64

In [112]:
# Put in-sample and test-set scores side by side; the two Series are aligned
# on their index (the model name).
df_score = pd.DataFrame(
    {
        "Same Data": model_accuracy,
        "Test Data": model_accuracy_test,
    }
)
df_score

Unnamed: 0,Same Data,Test Data
Decision Tree,0.859878,0.804613
Logistic Regression,0.833401,0.854817
SVC,0.783707,0.778833


The best model for predicting internet usage from the existing data is logistic regression, with about 85% accuracy on the test set.