# Acquisition and Prep

## Step 1
- Acquire customer_id, monthly_charges, tenure, and total_charges from the telco_churn database for all customers with a two-year contract.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from env import url

In [4]:

# Pull the four columns named in Step 1. contract_type_id = 3 is presumably the
# two-year contract described there — TODO confirm against the database.
sql = (
    "SELECT customer_id, monthly_charges, tenure, total_charges \n"
    "FROM customers\n"
    "WHERE contract_type_id = 3"
)

df = pd.read_sql(sql=sql, con=url("telco_churn"))

df.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,0013-SMEOE,109.7,71,7904.25
1,0014-BMAQU,84.65,63,5377.8
2,0016-QLJIS,90.45,65,5957.9
3,0017-DINOC,45.2,54,2460.55
4,0017-IUDMW,116.8,72,8456.75


## Step 2
- Walk through the steps above using your new dataframe. You may handle the missing values however you feel is appropriate.

In [5]:
# Row and column counts of the acquired frame.
df.shape

(1695, 4)

In [6]:
# Summary statistics. describe() reports only numeric columns by default, so
# an object-dtype column is left out of the table.
df.describe()

Unnamed: 0,monthly_charges,tenure
count,1695.0,1695.0
mean,60.770413,56.735103
std,34.678865,18.209363
min,18.4,0.0
25%,24.025,48.0
50%,64.35,64.0
75%,90.45,71.0
max,118.75,72.0


In [7]:
# Inspect the dtype and non-null count of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1695 entries, 0 to 1694
Data columns (total 4 columns):
customer_id        1695 non-null object
monthly_charges    1695 non-null float64
tenure             1695 non-null int64
total_charges      1695 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 53.1+ KB


In [8]:
# Tally the distinct total_charges values to surface any non-numeric entries.
df.total_charges.value_counts()

           10
1161.75     2
5714.2      2
343.45      2
844.45      2
           ..
5957.9      1
4819.75     1
107.6       1
5538.35     1
5611.75     1
Name: total_charges, Length: 1678, dtype: int64

In [9]:
# Trim surrounding whitespace so whitespace-only entries become empty strings.
df["total_charges"] = df["total_charges"].str.strip()

In [10]:
# Keep only the rows whose total_charges is not the empty string.
df = df.loc[df["total_charges"].ne('')]

In [11]:
# Re-check the row count and dtypes after filtering out blank total_charges.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1685 entries, 0 to 1694
Data columns (total 4 columns):
customer_id        1685 non-null object
monthly_charges    1685 non-null float64
tenure             1685 non-null int64
total_charges      1685 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 65.8+ KB


In [12]:
# Convert the remaining total_charges strings to floating point.
df["total_charges"] = df["total_charges"].astype(float)

In [13]:
# Confirm that total_charges now reports a float dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1685 entries, 0 to 1694
Data columns (total 4 columns):
customer_id        1685 non-null object
monthly_charges    1685 non-null float64
tenure             1685 non-null int64
total_charges      1685 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 65.8+ KB


## Step 3
- End with a Python file, wrangle.py, containing the function wrangle_telco(), which acquires the data and returns a cleaned dataframe with no missing values.

In [15]:
def wrangle_telco():
    """Acquire telco customer charges and return them cleaned of missing values.

    Reads customer_id, monthly_charges, tenure and total_charges from the
    ``customers`` table of the ``telco_churn`` database for rows with
    ``contract_type_id = 3``, then cleans the result:

    * surrounding whitespace in ``total_charges`` is stripped;
    * blank ``total_charges`` entries are treated as missing;
    * every row with a missing value in any column is dropped;
    * ``total_charges`` is cast to float.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id``, ``monthly_charges``, ``tenure`` and
        ``total_charges`` (float) with no missing values. Row labels from the
        query result are kept, so the index has gaps where rows were dropped.

    Raises
    ------
    ValueError
        If a non-blank ``total_charges`` entry cannot be parsed as a float.
    """
    # NOTE(review): contract_type_id = 3 is presumably the two-year contract
    # named in the notebook's Step 1 — confirm against the database.
    sql = (
        "SELECT customer_id, monthly_charges, tenure, total_charges \n"
        "    FROM customers\n"
        "    WHERE contract_type_id = 3"
    )
    raw = pd.read_sql(sql, url("telco_churn"))

    stripped = raw["total_charges"].str.strip()
    # mask() turns blanks into NaN so that a single dropna() removes blank and
    # NULL rows alike; comparing against '' alone would let NULLs through.
    cleaned = raw.assign(total_charges=stripped.mask(stripped == "")).dropna()

    # Build a new frame rather than assigning into a filtered one.
    return cleaned.astype({"total_charges": "float"})

In [16]:
# Rebuild df through the function and preview the first rows as a smoke test.
df = wrangle_telco()
df.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,0013-SMEOE,109.7,71,7904.25
1,0014-BMAQU,84.65,63,5377.8
2,0016-QLJIS,90.45,65,5957.9
3,0017-DINOC,45.2,54,2460.55
4,0017-IUDMW,116.8,72,8456.75
