In [1]:
# Environment set up
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from env import host, user, password

def get_db_url(user,host,password,dbname):
    url = f'mysql+pymysql://{user}:{password}@{host}/dbname'
    return url

dbname = 'titanic_db'
url = get_db_url(user,host,password,dbname)

# Data Acquisition Codes

### Clipboard

In [2]:
# df_clipboard=pd.read_clipboard()

### Excel

In [3]:
# df_excel=pd.read_excel('file_name.xls')

### *.csv

In [4]:
# df_csv=pd.read_csv('file_name.csv')

### SQL

In [5]:
# read_sql(sql_query, connection_url)

### Google Sheet

In [6]:
sheet_id='1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g'
sheet_name='first_sheet_by_default'
google_sheet_url=f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}'

In [7]:
df_googlesheet = pd.read_csv(google_sheet_url)
df_googlesheet.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803.0,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450.0,8.05,,S


In [8]:
# for native google urls
# csv_export_url = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')

### AWS S3

In [9]:
df_s3 = pd.read_csv('https://s3.amazonaws.com/irs-form-990/index_2011.csv')
df_s3.head()

Unnamed: 0,RETURN_ID,FILING_TYPE,EIN,TAX_PERIOD,SUB_DATE,TAXPAYER_NAME,RETURN_TYPE,DLN,OBJECT_ID
0,9091250,EFILE,591971002,201009,11/30/2011 1:06:39 AM,ANGELUS INC,990,93493316003251,201103169349300325
1,9091274,EFILE,251713602,201106,11/30/2011 1:09:14 AM,TOUCH-STONE SOLUTIONS INC,990,93493313012311,201113139349301231
2,9091275,EFILE,232705170,201012,11/30/2011 1:09:16 AM,RONALD MCDONALD HOUSE CHARITIES- PHILADELPHIA ...,990,93493313013011,201113139349301301
3,9091276,EFILE,581805618,201106,11/30/2011 1:09:19 AM,TORRINGTON VOA ELDERLY HOUSING INC BELL PARK T...,990,93493313013111,201113139349301311
4,9091277,EFILE,581876019,201106,11/30/2011 1:09:21 AM,HOUSTON VOA INDEPENDENT HOUSING INC HEIGHTS MANOR,990,93493313013161,201113139349301316


### SQL

In [10]:
import env
def get_connection(db, user=env.user, host=env.host, password=env.password):
    return f'mysql+pymysql://{user}:{password}@{host}/{db}'

df = pd.read_sql('SELECT * FROM passengers', get_connection('titanic_db'))

df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


### Data Caching Data in a Cache

In [11]:
df.to_csv('titanic.csv')

### Data Caching Workflow:

In [12]:
import os

def get_titanic_data():
    filename = "titanic.csv"

    if os.path.isfile(filename):
        return pd.read_csv(filename)
    else:
        # read the SQL query into a dataframe
        df = pd.read_sql('SELECT * FROM passengers', get_connection('titanic_db'))

        # Write that dataframe to disk for later. Called "caching" the data for later.
        df.to_file(filename)

        # Return the dataframe to the calling code
        return df

## Exercises


4. In a jupyter notebook, classification_exercises.ipynb, use a python module (pydata or seaborn datasets) containing datasets as a source from the iris data. Create a pandas dataframe, df_iris, from this data.
- print the first 3 rows
- print the number of rows and columns (shape)
- print the column names
- print the data type of each column
- print the summary statistics for each of the numeric variables

In [13]:
from pydataset import data
df_iris = data('iris')
df_iris.head(3)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa


In [None]:
#iris from SNS

In [14]:
df_iris.shape

(150, 5)

In [15]:
df_iris.columns

Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',
       'Species'],
      dtype='object')

In [16]:
df_iris.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 1 to 150
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Sepal.Length  150 non-null    float64
 1   Sepal.Width   150 non-null    float64
 2   Petal.Length  150 non-null    float64
 3   Petal.Width   150 non-null    float64
 4   Species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 7.0+ KB


In [60]:
stats=df_iris.describe().T
stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Sepal.Length,150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
Sepal.Width,150.0,3.057333,0.435866,2.0,2.8,3.0,3.3,4.4
Petal.Length,150.0,3.758,1.765298,1.0,1.6,4.35,5.1,6.9
Petal.Width,150.0,1.199333,0.762238,0.1,0.3,1.3,1.8,2.5


In [61]:
stats['range'] = stats.max()-stats.min()
stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,range
Sepal.Length,150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9,
Sepal.Width,150.0,3.057333,0.435866,2.0,2.8,3.0,3.3,4.4,
Petal.Length,150.0,3.758,1.765298,1.0,1.6,4.35,5.1,6.9,
Petal.Width,150.0,1.199333,0.762238,0.1,0.3,1.3,1.8,2.5,



5. Read the Table1_CustDetails table from your spreadsheet exercises google sheet into a dataframe named df_google_sheets.

### Make sure that the spreadsheet is publicly visible under your sharing settings.
- assign the first 100 rows to a new dataframe, df_google_sheets_sample
- print the number of rows of your original dataframe
- print the first 5 column names
- print the column names that have a data type of object
- compute the range for each of the numeric variables.

In [18]:
sheet_id='1p0E_0oLVJ3JeKNNarmlbtUFKPr71_ZO-Cvw5tpjV0LA'
sheet_name='Table1_CustDetails'
google_sheet_url=f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}'

In [24]:
df_cust = pd.read_csv(google_sheet_url)
df_cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7049 entries, 0 to 7048
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   customer_id        7049 non-null   object 
 1   gender             7049 non-null   object 
 2   is_senior_citizen  7049 non-null   int64  
 3   partner            7049 non-null   object 
 4   dependents         7049 non-null   object 
 5   phone_service      7049 non-null   int64  
 6   internet_service   7049 non-null   int64  
 7   contract_type      7049 non-null   int64  
 8   payment_type       7049 non-null   object 
 9   monthly_charges    7049 non-null   float64
 10  total_charges      7038 non-null   float64
 11  churn              7049 non-null   object 
 12  tenure             7049 non-null   float64
 13  Unnamed: 13        0 non-null      float64
dtypes: float64(4), int64(4), object(6)
memory usage: 771.1+ KB


In [25]:
df_cust= df_cust.drop(columns = ['Unnamed: 13'])
df_excel_sample = df_cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7049 entries, 0 to 7048
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   customer_id        7049 non-null   object 
 1   gender             7049 non-null   object 
 2   is_senior_citizen  7049 non-null   int64  
 3   partner            7049 non-null   object 
 4   dependents         7049 non-null   object 
 5   phone_service      7049 non-null   int64  
 6   internet_service   7049 non-null   int64  
 7   contract_type      7049 non-null   int64  
 8   payment_type       7049 non-null   object 
 9   monthly_charges    7049 non-null   float64
 10  total_charges      7038 non-null   float64
 11  churn              7049 non-null   object 
 12  tenure             7049 non-null   float64
dtypes: float64(3), int64(4), object(6)
memory usage: 716.0+ KB


In [26]:
# Cleaning NA out of data
df_cust = df_cust.dropna()
df_cust.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7038 entries, 0 to 7048
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   customer_id        7038 non-null   object 
 1   gender             7038 non-null   object 
 2   is_senior_citizen  7038 non-null   int64  
 3   partner            7038 non-null   object 
 4   dependents         7038 non-null   object 
 5   phone_service      7038 non-null   int64  
 6   internet_service   7038 non-null   int64  
 7   contract_type      7038 non-null   int64  
 8   payment_type       7038 non-null   object 
 9   monthly_charges    7038 non-null   float64
 10  total_charges      7038 non-null   float64
 11  churn              7038 non-null   object 
 12  tenure             7038 non-null   float64
dtypes: float64(3), int64(4), object(6)
memory usage: 769.8+ KB


In [28]:
# assign the first 100 rows to a new dataframe, df_google_sheets_sample
df_excel_sample = df_cust.head(100)

In [32]:
# print the number of rows of your original dataframe
df_cust.shape

(7038, 13)

In [33]:
# assign the first 100 rows to a new dataframe, df_google_sheets_sample
df_excel_sample.shape

(100, 13)

In [36]:
# print the first 5 column names
x=df_excel_sample.columns.to_list()
x[:5]

['customer_id', 'gender', 'is_senior_citizen', 'partner', 'dependents']

In [43]:
# print the column names that have a data type of object
list(df_excel_sample.select_dtypes('object'))


['customer_id', 'gender', 'partner', 'dependents', 'payment_type', 'churn']

In [44]:
# compute the range for each of the numeric variables.
y = df_excel_sample.select_dtypes('number')
y.max()-y.min()

is_senior_citizen       1.00
phone_service           2.00
internet_service        2.00
contract_type           2.00
monthly_charges        97.40
total_charges        8476.85
tenure                 73.20
dtype: float64

6. Read the data from [this google sheet](https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit?usp=sharing) into a dataframe, `df_google`

    - print the first 3 rows
    - print the number of rows and columns
    - print the column names
    - print the data type of each column
    - print the summary statistics for each of the numeric variables
    - print the unique values for each of your categorical variables

In [45]:
sheet_id='1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g'
sheet_name='train'
google_sheet_url=f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}'

In [46]:
df_google = pd.read_csv(google_sheet_url)
df_google.shape

(891, 12)

In [47]:
# print the first 3 rows
df_google.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,,7.925,,S


In [49]:
# print the number of rows and columns
df_google.shape

(891, 12)

In [53]:
# print the column names
x=df_google.columns.to_list()
x

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [55]:
# print the data type of each column
df_google.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       661 non-null    float64
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(3), int64(5), object(4)
memory usage: 83.7+ KB


In [56]:
# print the summary statistics for each of the numeric variables
df_google.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,661.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,260318.5,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,471609.3,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,693.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,19996.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,236171.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,347743.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,3101298.0,512.3292


In [59]:
# print the unique values for each of your categorical variables
df_google.select_dtypes('object').apply(lambda col: col.unique().tolist())

Name        [Braund, Mr. Owen Harris, Cumings, Mrs. John Bradley (Florence Briggs Thayer), Heikkin...
Sex                                                                                    [male, female]
Cabin       [nan, C85, C123, E46, G6, C103, D56, A6, C23 C25 C27, B78, D33, B30, C52, B28, C83, F3...
Embarked                                                                               [S, C, Q, nan]
dtype: object

In [None]:
for col in df_google.columns:
    #print(col)
    

In [62]:
df_google.Pclass.value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

- Using the Iris Data:

1. Use the function defined in acquire.py to load the iris data.

2. Drop the species_id and measurement_id columns.

3. Rename the species_name column to just species.

4. Create dummy variables of the species name and concatenate onto the iris dataframe. (This is for practice, we don't always have to encode the target, but if we used species as a feature, we would need to encode it).

5. Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.



In [64]:
import acquire

In [69]:
iris = acquire.get_iris_data()
iris.head()

Using cached csv


Unnamed: 0,species_id,species_name,sepal_length,sepal_width,petal_length,petal_width
0,1,setosa,5.1,3.5,1.4,0.2
1,1,setosa,4.9,3.0,1.4,0.2
2,1,setosa,4.7,3.2,1.3,0.2
3,1,setosa,4.6,3.1,1.5,0.2
4,1,setosa,5.0,3.6,1.4,0.2


In [72]:
# Drop the species_id and measurement_id columns.
iris2=iris.drop(['species_id'], axis=1)
iris2.head()

Unnamed: 0,species_name,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2


In [74]:
# Rename the species_name column to just species.
iris3 = iris2.rename({'species_name':'species'}, axis=1)
iris3.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2


In [81]:
dummy_iris = pd.get_dummies(iris3[['species']], dummy_na=False, drop_first=[True])
dummy_iris.head()

Unnamed: 0,species_versicolor,species_virginica
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [83]:
# Create dummy variables of the species name and concatenate onto the iris dataframe. (This is for practice, we don't always have to encode the target, but if we used species as a feature, we would need to encode it).
iris4 = pd.concat([iris3,dummy_iris], axis=1)
iris4.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width,species_versicolor,species_virginica
0,setosa,5.1,3.5,1.4,0.2,0,0
1,setosa,4.9,3.0,1.4,0.2,0,0
2,setosa,4.7,3.2,1.3,0.2,0,0
3,setosa,4.6,3.1,1.5,0.2,0,0
4,setosa,5.0,3.6,1.4,0.2,0,0


In [159]:
#Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.
def prep_iris():
    iris = acquire.get_iris_data()
    iris2 =iris.drop(['species_id'], axis=1)
    iris3 = iris2.rename({'species_name':'species'}, axis=1)
    dummy_iris = pd.get_dummies(iris3[['species']], dummy_na=False, drop_first=[True])
    iris4 = pd.concat([iris3,dummy_iris], axis=1)
    return iris4

In [161]:
prep_iris().head()

Using cached csv


Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width,species_versicolor,species_virginica
0,setosa,5.1,3.5,1.4,0.2,0,0
1,setosa,4.9,3.0,1.4,0.2,0,0
2,setosa,4.7,3.2,1.3,0.2,0,0
3,setosa,4.6,3.1,1.5,0.2,0,0
4,setosa,5.0,3.6,1.4,0.2,0,0


In [88]:
titanic = acquire.get_titanic_data()

Using cached csv


In [90]:
titanic.drop_duplicates()
titanic.drop_duplicates()
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [91]:
titanic.isna().sum()

passenger_id      0
survived          0
pclass            0
sex               0
age             177
sibsp             0
parch             0
fare              0
embarked          2
class             0
deck            688
embark_town       2
alone             0
dtype: int64

In [153]:
# From lesson
def prep_titanic():
    '''
    This function will clean the data...
    '''
    titanic = acquire.get_titanic_data()
    titanic = titanic.drop_duplicates()
    cols_to_drop = ['deck', 'embarked', 'class', 'age']
    titanic2 = titanic.drop(columns=cols_to_drop)
    titanic2['embark_town'] = titanic2.embark_town.fillna(value='Southampton')
    dummy_df = pd.get_dummies(titanic2[['sex', 'embark_town']], dummy_na=False, drop_first=[True, True])
    titanic3 = pd.concat([titanic2, dummy_df], axis=1)
    print('Data cleaned for duplicates, columns dropped [deck, embarked, class, age], filled na, and added numerical versions of sex and embark')
    return titanic3

In [154]:
prep_titanic()

Using cached csv
Data cleaned for duplicates, columns dropped [deck, embarked, class, age], filled na, and added numerical versions of sex and embark


Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,male,1,0,7.2500,Southampton,0,1,0,1
1,1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,2,1,3,female,0,0,7.9250,Southampton,1,0,0,1
3,3,1,1,female,1,0,53.1000,Southampton,0,0,0,1
4,4,0,3,male,0,0,8.0500,Southampton,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,0,0,13.0000,Southampton,1,1,0,1
887,887,1,1,female,0,0,30.0000,Southampton,1,0,0,1
888,888,0,3,female,1,2,23.4500,Southampton,0,0,0,1
889,889,1,1,male,0,0,30.0000,Cherbourg,1,1,0,0


- Using the Telco dataset

1. Use the function defined in acquire.py to load the Telco data.
2. Drop any unnecessary, unhelpful, or duplicated columns. This could mean dropping foreign key columns but keeping the corresponding string values, for example.
3. Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.
4. Create a function named prep_telco that accepts the raw telco data, and returns the data with the transformations above applied.

In [151]:
def prep_telco():
    telco=acquire.get_telco_data()
    # Drop Duplicates
    telco.drop(columns=['payment_type_id','internet_service_type_id','contract_type_id','customer_id'], inplace=True)
    # Get rid of whitespace
    telco.total_charges.replace(to_replace=[' ',''],value = np.nan,inplace=True)
    telco = telco[telco.total_charges != '']
    # Convert to float
    telco['total_charges'] = telco.total_charges.astype(float)
    
    # adding categorical variables to numeric
    telco['gender_binary'] = telco.gender.map({'Female':1, 'Male': 0})
    telco['partner_binary'] = telco.partner.map({'Yes':1,'No': 0})
    telco['dependents_binary'] = telco.dependents.map({'Yes':1, 'No':0})
    telco['phone_service_binary'] = telco.phone_service.map({'Yes':1, 'No':0})
    telco['paperless_billing_encoded'] = telco.paperless_billing.map({'Yes': 1, 'No': 0})
    telco['churn_encoded'] = telco.churn.map({'Yes': 1, 'No': 0})
    
    return telco

In [152]:
prep_telco()

Using cached csv


Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,...,churn,contract_type,internet_service_type,payment_type,gender_binary,partner_binary,dependents_binary,phone_service_binary,paperless_billing_encoded,churn_encoded
0,Female,0,Yes,Yes,9,Yes,No,No,Yes,No,...,No,One year,DSL,Mailed check,1,1,1,1,1,0
1,Male,0,No,No,9,Yes,Yes,No,No,No,...,No,Month-to-month,DSL,Mailed check,0,0,0,1,0,0
2,Male,0,No,No,4,Yes,No,No,No,Yes,...,Yes,Month-to-month,Fiber optic,Electronic check,0,0,0,1,1,1
3,Male,1,Yes,No,13,Yes,No,No,Yes,Yes,...,Yes,Month-to-month,Fiber optic,Electronic check,0,1,0,1,1,1
4,Female,1,Yes,No,3,Yes,No,No,No,No,...,Yes,Month-to-month,Fiber optic,Mailed check,1,1,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Female,0,No,No,13,Yes,No,Yes,No,No,...,No,One year,DSL,Mailed check,1,0,0,1,0,0
7039,Male,0,Yes,No,22,Yes,Yes,No,No,No,...,Yes,Month-to-month,Fiber optic,Electronic check,0,1,0,1,1,1
7040,Male,0,No,No,2,Yes,No,No,Yes,No,...,No,Month-to-month,DSL,Mailed check,0,0,0,1,1,0
7041,Male,0,Yes,Yes,67,Yes,No,Yes,No,Yes,...,No,Two year,DSL,Mailed check,0,1,1,1,0,0
