# Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Bring in Data

In [2]:
bexar = pd.read_csv("bexar_county_temp_data.csv")

# Clean up Data

In [3]:
bexar.drop(columns='Unnamed: 0', inplace=True)

# Split the Original Dataframe into Dataframes Based on the Dtype Column

In [4]:
bexar_min = bexar.loc[bexar['dtype'] == 'min']
bexar_max = bexar.loc[bexar['dtype'] == 'max']
bexar_avg = bexar.loc[bexar['dtype'] == 'avg']

# Remove all columns not needed for the analysis

In [5]:
bexar_max.drop(columns=['county_name','dtype'], inplace=True)
bexar_min.drop(columns=['county_name','dtype'], inplace=True)
bexar_avg.drop(columns=['county_name','dtype'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


# Melt all the DFs
This is needed because the dataframes are currently split into seperate columns by month and this will make the analysis more difficult in terms of data manipulation

In [6]:
melt_max = pd.melt(bexar_max, id_vars=['year'], var_name='month', value_name='max_temp')
melt_min = pd.melt(bexar_min, id_vars=['year'], var_name='month', value_name='min_temp')
melt_avg = pd.melt(bexar_avg, id_vars=['year'], var_name='month', value_name='avg_temp')

# Convert the months from three letter format to two number format this will make it easier to make a date column by combining it with the year

In [7]:
melt_max.replace({'month' : { 
    'jan' : '01',
    'feb' : '02',
    'mar' : '03',
    'apr': '04',
    'may': '05',
    'jun': '06',
    'jul': '07',
    'aug': '08',
    'sep': '09',
    'oct': '10',
    'nov': '11',
    'dec': '12'
}}, inplace=True)
melt_min.replace({'month' : { 
    'jan' : '01',
    'feb' : '02',
    'mar' : '03',
    'apr': '04',
    'may': '05',
    'jun': '06',
    'jul': '07',
    'aug': '08',
    'sep': '09',
    'oct': '10',
    'nov': '11',
    'dec': '12'
}}, inplace=True)
melt_avg.replace({'month' : { 
    'jan' : '01',
    'feb' : '02',
    'mar' : '03',
    'apr': '04',
    'may': '05',
    'jun': '06',
    'jul': '07',
    'aug': '08',
    'sep': '09',
    'oct': '10',
    'nov': '11',
    'dec': '12'
}}, inplace=True)

# Sort the rows by year and month so that they are ordered chronologically

In [8]:
melt_max.sort_values(by=['year','month'], inplace=True)
melt_min.sort_values(by=['year','month'], inplace=True)
melt_avg.sort_values(by=['year','month'], inplace=True)

# Combine the year and month columns

In [9]:
melt_max["date"] = melt_max["year"].map(str) + '/' +melt_max["month"]
melt_min["date"] = melt_min["year"].map(str) + '/' +melt_min["month"]
melt_avg["date"] = melt_avg["year"].map(str) + '/' +melt_avg["month"]

# Convert the new date column into a datetime type

In [10]:
melt_max["date"] = pd.to_datetime(melt_max['date'], format='%Y-%m')
melt_min["date"] = pd.to_datetime(melt_min['date'], format='%Y-%m')
melt_avg["date"] = pd.to_datetime(melt_avg['date'], format='%Y-%m')

# Drop the old columns

In [11]:
melt_max.drop(columns=['year','month'], inplace=True)
melt_min.drop(columns=['year','month'], inplace=True)
melt_avg.drop(columns=['year','month'], inplace=True)

# Set the date column to the index

In [12]:
melt_max = melt_max.set_index('date')
melt_min = melt_min.set_index('date')
melt_avg = melt_avg.set_index('date')

# Drop the NaN's from the dataframe since the only NaN values are from future dates which have yet to be recorded

In [13]:
melt_max.dropna(inplace=True)
melt_min.dropna(inplace=True)
melt_avg.dropna(inplace=True)

# Merge all the dataframes together

In [14]:
temp = pd.merge(melt_max, melt_min ,on='date')

In [15]:
temperature_data_cleaned_and_prepped = pd.merge(temp, melt_avg, on='date')

# Data is now cleaned and prepped

In [16]:
temperature_data_cleaned_and_prepped.head()

Unnamed: 0_level_0,max_temp,min_temp,avg_temp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1895-01-01,24.96,4.48,14.72
1895-02-01,27.88,2.25,15.06
1895-03-01,35.51,11.12,23.31
1895-04-01,55.49,26.22,40.86
1895-05-01,57.78,31.86,44.82


# Turn the final dataframe into a csv

In [17]:
# temperature_data_cleaned_and_prepped.to_csv(r'/Users/codywatson/desktop/water_conservation_proposal/temperature_data_cleaned_and_prepped.csv')