In [1]:
import pandas as pd
import numpy as np

### Step 2: Explore and Assess the Data
#### Explore and Clean Data
- Identify data quality issues, like missing values, duplicate data, etc.
- Document steps necessary to clean the data

### Functions to use later

In [2]:
#function to describe dataframe from step 1
def stats_on_df(df, name):
    print("\nThere are {} rows and {} columns of data in the {} file".format(len(df), len(df.columns), name))
    print("columns are {}".format(df.columns))

In [3]:
#function to compare series to identify similarity
def matcher(series1, series2):
    matches = [i for i in series1 if i in series2]
    print(matches)

## Airports
Let's start with the **airport** data set found in /data_first_cleaning/airports.csv 

In [4]:
airport_df = pd.read_csv("data_first_cleaning/airports.csv")
stats_on_df(airport_df,"airports")


There are 55075 rows and 12 columns of data in the airports file
columns are Index(['airport_identifier', 'airport_size', 'airport_name', 'elevation_ft',
       'continent', 'country', 'iso_region', 'municipality', 'gps_code',
       'iata_code', 'local_code', 'coordinates'],
      dtype='object')


In [5]:
airport_df.airport_size.unique()

array(['heliport', 'small_airport', 'closed', 'seaplane_base',
       'balloonport', 'medium_airport', 'large_airport'], dtype=object)

We will assume that only airports are relevant to immigration.

In [6]:
airports = ['small_airport', 'medium_airport', 'large_airport']
airport_df = airport_df[airport_df.airport_size.isin(airports)]
print("\nThere are {} rows of data in the airport file.".format(len(airport_df)))


There are 39142 rows of data in the airport file.


In [7]:
print("continents = {}".format(airport_df.continent.unique())) #has nan
print("countries = {}".format(airport_df.country.unique())) #no nan

continents = [nan 'OC' 'AF' 'AN' 'AS' 'SA' 'EU']
countries = ['US' 'PR' 'MH' 'SO' 'AQ' 'PG' 'SD' 'SA' 'AE' 'SS' 'CN' 'AF' 'SB' 'CO' 'AU'
 'MG' 'TD' 'AL' 'AM' 'MX' 'MZ' 'PW' 'NR' 'AO' 'AR' 'AS' 'GA' 'AZ' 'BA' 'BB'
 'BE' 'BF' 'BG' 'GL' 'BI' 'IS' 'BJ' 'OM' 'XK' 'KE' 'BO' 'BR' 'BS' 'CV' 'BW'
 'BY' 'UA' 'LR' 'BZ' 'CA' 'GB' 'CD' 'CF' 'CG' 'PH' 'MR' 'CH' 'CL' 'CM' 'MA'
 'CR' 'CU' 'CY' 'CZ' 'SK' 'PA' 'DZ' 'DE' 'ID' 'GH' 'RU' 'CI' 'DK' 'NG' 'NE'
 'TN' 'TG' 'EC' 'EE' 'FI' 'EG' 'GG' 'JE' 'IM' 'FK' 'EH' 'NL' 'IE' 'FO' 'LU'
 'NO' 'PL' 'ER' 'MN' 'ES' 'PT' 'SE' 'ET' 'LV' 'LT' 'ZA' 'SZ' 'GQ' 'SH' 'MU'
 'FJ' 'IO' 'ZM' 'FM' 'KM' 'YT' 'RE' 'TF' 'ST' 'FR' 'SC' 'ZW' 'MW' 'LS' nan
 'ML' 'GM' 'GE' 'GF' 'SL' 'GW' 'GN' 'SN' 'GR' 'GT' 'TZ' 'GY' 'SR' 'DJ' 'LY'
 'HN' 'VN' 'HR' 'KZ' 'RW' 'HT' 'HU' 'UG' 'TL' 'IL' 'IN' 'IQ' 'IR' 'JP' 'IT'
 'JM' 'JO' 'KG' 'KP' 'KR' 'MY' 'PM' 'SI' 'LK' 'MT' 'AT' 'RO' 'TR' 'MD' 'MK'
 'GI' 'RS' 'ME' 'TC' 'DO' 'MM' 'NI' 'SV' 'KY' 'NC' 'CK' 'TO' 'KI' 'TV' 'NU'
 'WF' 'NP' 'WS' 'PF' 'VU' 'N

In [8]:
nans_cont = len(airport_df[airport_df.continent.isna()])/len(airport_df)
print("{}% of the continent data is nan".format(round(nans_cont*100,0)))

45.0% of the continent data is nan


In [9]:
nan_countries = airport_df.country[airport_df.continent.isna()].unique()
notnan_countries = airport_df.country[airport_df.continent.notna()].unique()
matcher(nan_countries, notnan_countries)
#matches = [i for i in nan_countries if i in notnan_countries] #see if there is a country-based logic behind nans in continent data
#print(matches)

[]


Since there is no intersection between these 2 arrays, we conclude that continent being nan is directly based on specific countries. Let's add the continent for the countries with nan. Data is cross-referenced with https://www.nationsonline.org/oneworld/country_code_list.htm

In [10]:
nan_countries

array(['US', 'PR', 'MX', 'BB', 'GL', 'BS', 'BZ', 'CA', 'CR', 'CU', 'PA',
       'GT', 'HN', 'HT', 'JM', 'PM', 'TC', 'DO', 'NI', 'SV', 'KY', 'AG',
       'DM', 'GP', 'MQ', 'MF', 'BL', 'GD', 'VI', 'KN', 'LC', 'AW', 'BQ',
       'CW', 'SX', 'AI', 'MS', 'TT', 'VG', 'VC', 'BM'], dtype=object)

In [11]:
#north america countries in nan_countries array according to https://www.nationsonline.org
na_countries = ['US', 'PR', 'MX', 'BB', 'BS', 'BZ', 'CA', 'CR', 'CU', 'PA',
       'GT', 'HN', 'HT', 'JM', 'PM', 'TC', 'DO', 'NI', 'SV', 'KY', 'AG',
       'DM', 'GP', 'MQ', 'MF', 'BL', 'GD', 'VI', 'KN', 'LC', 'AW', 'AI', 'MS', 'TT', 'VG', 'VC', 'BM']
    
eu_countries = ['GL']

other_countries = ['BQ','CW', 'SX'] #will be dropped since they are not in the given source of country codes

In [12]:
airport_df.loc[airport_df['country'].isin(na_countries), 'continent'] = 'NA'
airport_df.loc[airport_df['country'].isin(eu_countries), 'continent'] = 'EU'
airport_df = airport_df[airport_df['continent'].notna()]
print("\nThere are {} rows of data in the airport file.".format(len(airport_df)))


There are 39137 rows of data in the airport file.


We have now cleaned the continent column and only lost 5 rows of data due to potentially obsolete information.

In [13]:
print(str(len(airport_df[airport_df.airport_identifier==airport_df.gps_code])) + " rows with airport_identifier=gps_code.")
print(str(len(airport_df[airport_df.gps_code==airport_df.local_code])) + " rows with gps_code=local_code.")
print(str(len(airport_df[airport_df.airport_identifier==airport_df.local_code])) + " rows with airport_identifier=local_code.")

29173 rows with airport_identifier=gps_code.
12305 rows with gps_code=local_code.
11725 rows with airport_identifier=local_code.


A lot of the data contains the same values for ident, gps_code, and local_code columns, but there is still a significant amount of data with different values in these columns therefore we will maintain all 3 columns columns.

In [14]:
airport_df.elevation_ft = pd.to_numeric(airport_df.elevation_ft, downcast='integer') #condense data
airport_df.head(2)
airport_df = airport_df[pd.notnull(airport_df.iata_code)]

In [15]:
airport_df.to_csv('data_second_cleaning/airports.csv', index=False) #save for to use in step 4

## Cities (Demographics)
Next we'll work with with the **cities** data set found in /data_first_cleaning/cities.csv 

In [16]:
cities_df = pd.read_csv("data_first_cleaning/cities.csv")
stats_on_df(cities_df,"cities")


There are 2891 rows and 12 columns of data in the cities file
columns are Index(['City', 'State', 'Median Age', 'Male Population', 'Female Population',
       'Total Population', 'Number of Veterans', 'Foreign-born',
       'Average Household Size', 'State Code', 'Race', 'Race Population'],
      dtype='object')


In [17]:
cities_df.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Race Population
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


In [18]:
cities_df.Race.unique()

array(['Hispanic or Latino', 'White', 'Asian', 'Black or African-American',
       'American Indian and Alaska Native'], dtype=object)

For our purposes, we do not want separate rows for each city. We will reshape our data first to have 1 row per city. Then, since our immigration data does not give us a destination city but rather a destination state. We will condense the data into 1 row per state. 

In [19]:
#cities_melted = pd.melt(cities_df, id_vars=['Race'], value_vars=['Race Population'])
#cities_grouped = cities_df.groupby(['Race']).aggregate('sum')
cities_index = list(cities_df.columns)
cities_index.remove('Race')
cities_index.remove('Race Population')
cities_pivot = pd.pivot_table(cities_df, values='Race Population', index=cities_index, columns=['Race'], aggfunc=np.sum)
cities_pivot = cities_pivot.reset_index()

In [20]:
cities_pivot.State = cities_pivot['State Code']
cities_pivot.head()

Race,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,American Indian and Alaska Native,Asian,Black or African-American,Hispanic or Latino,White
0,Abilene,TX,31.3,65212.0,60664.0,125876,9367.0,8129.0,2.64,TX,1813.0,2929.0,14449.0,33222.0,95487.0
1,Akron,OH,38.1,96886.0,100667.0,197553,12878.0,10024.0,2.24,OH,1845.0,9033.0,66551.0,3684.0,129192.0
2,Alafaya,FL,33.5,39504.0,45760.0,85264,4176.0,15842.0,2.94,FL,,10336.0,6577.0,34897.0,63666.0
3,Alameda,CA,41.4,37747.0,40867.0,78614,4504.0,18841.0,2.52,CA,1329.0,27984.0,7364.0,8265.0,44232.0
4,Albany,GA,33.3,31695.0,39414.0,71109,5409.0,861.0,2.38,GA,445.0,650.0,53440.0,1783.0,17160.0


In [21]:
states_pivot = cities_pivot.groupby(['State']).aggregate('sum')
cities_pivot2 = cities_pivot.loc[:,['State', 'Median Age', 'Average Household Size']]
states_pivot2 = cities_pivot2.groupby(['State']).aggregate('mean')
states_pivot.loc[:,['Median Age', 'Average Household Size']] = round(states_pivot2.loc[:,['Median Age', 'Average Household Size']],2)
sum(states_pivot['Total Population'])

115649397

Unfortunately, this data does not look to be complete as the US population is greater than 300 million. To address this, we will assume that the ratios are still valid and convert our data to percentages. We will convert total population to % of the entire country and the other population columns as % of the total for the specific state.

In [22]:
#convert population columns except total to % of the state
states_cols = list(states_pivot.columns)
states_cols.remove('Median Age')
states_cols.remove('Total Population')
states_cols.remove('Average Household Size')
for col in states_cols:
    states_pivot[col] = round(states_pivot[col]/states_pivot['Total Population'],3)*100
states_pivot.head()

Race,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,American Indian and Alaska Native,Asian,Black or African-American,Hispanic or Latino,White
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AK,32.2,51.2,48.8,298695,9.2,11.1,2.77,12.2,12.3,7.7,9.1,71.2
AL,36.23,47.4,52.6,1049629,6.8,5.0,2.43,0.8,2.7,49.6,3.7,47.5
AR,32.77,48.6,51.4,589879,5.4,10.5,2.53,1.6,3.7,25.4,13.2,65.2
AZ,35.04,49.5,50.5,4499542,5.9,15.2,2.77,2.9,5.1,6.6,33.5,79.8
CA,36.18,49.5,50.5,24822460,3.7,30.0,3.1,1.6,18.3,8.2,39.7,60.0


In [23]:
#convert total population column to % of country total
states_pivot['Total Population'] = round(states_pivot['Total Population']/sum(states_pivot['Total Population']),3)*100

In [24]:
states_pivot.head()

Race,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,American Indian and Alaska Native,Asian,Black or African-American,Hispanic or Latino,White
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AK,32.2,51.2,48.8,0.3,9.2,11.1,2.77,12.2,12.3,7.7,9.1,71.2
AL,36.23,47.4,52.6,0.9,6.8,5.0,2.43,0.8,2.7,49.6,3.7,47.5
AR,32.77,48.6,51.4,0.5,5.4,10.5,2.53,1.6,3.7,25.4,13.2,65.2
AZ,35.04,49.5,50.5,3.9,5.9,15.2,2.77,2.9,5.1,6.6,33.5,79.8
CA,36.18,49.5,50.5,21.5,3.7,30.0,3.1,1.6,18.3,8.2,39.7,60.0


In [25]:
states_pivot.to_csv('data_second_cleaning/states.csv', index=True) #save for to use in step 4

## Immigration Data
Let's explore the immigration sample data since it will be much easier to work with than the SAS data. This will allow us to identify the cleaning steps necessary on the larger set. 

In [26]:
imm_df = pd.read_csv("data_first_cleaning/immigration_sample.csv")
stats_on_df(imm_df,"immigration sample")


There are 1000 rows and 29 columns of data in the immigration sample file
columns are Index(['Unnamed: 0', 'cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 'i94port',
       'arrdate', 'i94mode', 'i94addr', 'depdate', 'i94bir', 'i94visa',
       'count', 'dtadfile', 'visapost', 'occup', 'entdepa', 'entdepd',
       'entdepu', 'matflag', 'biryear', 'dtaddto', 'gender', 'insnum',
       'airline', 'admnum', 'fltno', 'visatype'],
      dtype='object')


In [27]:
print(len(imm_df['count'].unique()))

1


There is no purpose for the count column as every row contains an identical value. This will not be maintained in our data model. 

In [28]:
print(imm_df.i94mode.unique())

[ 1.  3.  2.  9.]


Numeric coding will be changed from 1 to air, 2 to sea, and 3 to land respectively. Then the air data can be joined with the airports.csv data to provided additional information. 

## Temperature Data
Let's explore the temperature data and see what cleaning is necessary.

In [29]:
temp_df_full = pd.read_csv('../../data2/GlobalLandTemperaturesByCity.csv') 
stats_on_df(temp_df_full, 'temperatures all cities')


There are 8599212 rows and 7 columns of data in the temperatures all cities file
columns are Index(['dt', 'AverageTemperature', 'AverageTemperatureUncertainty', 'City',
       'Country', 'Latitude', 'Longitude'],
      dtype='object')


With over 8 million rows, this data will be very difficult to work with in Pandas. For our purposes, we do not need such extensive historical data. We can quickly truncate the data into a more manageable size by only maintaining recent years.

In [30]:
temp_df_full['Year'] = temp_df_full['dt'].astype(str).str[:4].astype(int)
temp_df_full.head()
temp_recent = temp_df_full[temp_df_full['Year']>1990]

In [31]:
stats_on_df(temp_recent, 'recent temps')


There are 958230 rows and 8 columns of data in the recent temps file
columns are Index(['dt', 'AverageTemperature', 'AverageTemperatureUncertainty', 'City',
       'Country', 'Latitude', 'Longitude', 'Year'],
      dtype='object')


Now that our dataset is less than 1 million rows, we can use pandas much more efficiently. We would like to prep our data so that we can easy have the following fields via an SQL query When loading our data in the data pipeline:

- Highest monthly avg Temp (of most recent year)
- Lowest monthly avg Temp  (of most recent year)
- Temperature delta vs. 10 years ago (compare most recent year vs. 10 years prior)
- Temperature delta vs. 20 years ago (compare most recent year vs. 20 years prior)

Let's clean the data here so that we will be able to easily obtain the necessary fields in a SQL query in a later step. 

In [32]:
temp_recent.tail()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,Year
8599207,2013-05-01,11.464,0.236,Zwolle,Netherlands,52.24N,5.26E,2013
8599208,2013-06-01,15.043,0.261,Zwolle,Netherlands,52.24N,5.26E,2013
8599209,2013-07-01,18.775,0.193,Zwolle,Netherlands,52.24N,5.26E,2013
8599210,2013-08-01,18.025,0.298,Zwolle,Netherlands,52.24N,5.26E,2013
8599211,2013-09-01,,,Zwolle,Netherlands,52.24N,5.26E,2013


In [33]:
years = [1993, 2003, 2013]
temp_recent = temp_recent[temp_recent.Year.isin(years)]
stats_on_df(temp_recent, 'recent temps')


There are 115830 rows and 8 columns of data in the recent temps file
columns are Index(['dt', 'AverageTemperature', 'AverageTemperatureUncertainty', 'City',
       'Country', 'Latitude', 'Longitude', 'Year'],
      dtype='object')


In [34]:
temp_recent[temp_recent['City']=="Detroit"].head(5)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,Year
1982569,1993-01-01,-2.738,0.173,Detroit,United States,42.59N,82.91W,1993
1982570,1993-02-01,-5.685,0.292,Detroit,United States,42.59N,82.91W,1993
1982571,1993-03-01,-0.172,0.378,Detroit,United States,42.59N,82.91W,1993
1982572,1993-04-01,7.35,0.222,Detroit,United States,42.59N,82.91W,1993
1982573,1993-05-01,14.436,0.302,Detroit,United States,42.59N,82.91W,1993


Here we can see that the temperature data has city and country, however our immigration data only gives us the intended state and not city for destination. Therefore we'll need to reference our cities dataset to get state for each city.

In [35]:
citystate_df = pd.read_csv('data_first_cleaning/cities.csv')
stats_on_df(citystate_df, 'city state reference')


There are 2891 rows and 12 columns of data in the city state reference file
columns are Index(['City', 'State', 'Median Age', 'Male Population', 'Female Population',
       'Total Population', 'Number of Veterans', 'Foreign-born',
       'Average Household Size', 'State Code', 'Race', 'Race Population'],
      dtype='object')


In [36]:
citystate_df = citystate_df[['City','State','Total Population']]
citystate_df = citystate_df.groupby(['City','State']).aggregate('max') #remove duplicates
citystate_df = citystate_df.reset_index()
citystate_df = citystate_df[['City','State']]
citystate_df[citystate_df.City=='Detroit']

Unnamed: 0,City,State
144,Detroit,Michigan


In [37]:
temp_joined = temp_recent.set_index('City').join(citystate_df.set_index('City'))
temp_joined = temp_joined.reset_index()

In [38]:
stats_on_df(temp_joined, 'cities')


There are 116919 rows and 9 columns of data in the cities file
columns are Index(['City', 'dt', 'AverageTemperature', 'AverageTemperatureUncertainty',
       'Country', 'Latitude', 'Longitude', 'Year', 'State'],
      dtype='object')


In [39]:
print(temp_joined.head())
temp_joined.loc[temp_joined.Country != 'United States','State'] = np.NaN
#df.loc[df.ID == 103, 'FirstName'] = "Matt"
temp_joined['Month'] = temp_joined['dt'].astype(str).str[5:7].astype(int)

del temp_joined['dt']

       City          dt  AverageTemperature  AverageTemperatureUncertainty  \
0  A Coruña  1993-01-01               9.833                          0.455   
1  A Coruña  1993-02-01               9.737                          0.659   
2  A Coruña  1993-03-01              11.353                          0.246   
3  A Coruña  1993-04-01              11.104                          0.234   
4  A Coruña  1993-05-01              13.724                          0.473   

  Country Latitude Longitude  Year State  
0   Spain   42.59N     8.73W  1993   NaN  
1   Spain   42.59N     8.73W  1993   NaN  
2   Spain   42.59N     8.73W  1993   NaN  
3   Spain   42.59N     8.73W  1993   NaN  
4   Spain   42.59N     8.73W  1993   NaN  


In [40]:
print(temp_joined[temp_joined.Country=='Mexico'].State.unique())

[nan]


In [41]:
temp_df = temp_joined.copy()
#Create min max and average data
groupby_cols = ['Latitude','Longitude','Year', 'Country']
temp_df['avg_yrly_temp']=round(temp_df.groupby(groupby_cols)['AverageTemperature'].transform('mean'),2)
temp_df['mnthly_high_temp']=round(temp_df.groupby(groupby_cols)['AverageTemperature'].transform('max'),2)
temp_df['mnthly_low_temp']=round(temp_df.groupby(groupby_cols)['AverageTemperature'].transform('min'),2)
temp_df = temp_df.groupby(groupby_cols).last()
temp_df.reset_index(inplace=True)
print(temp_df.head(3))

  Latitude Longitude  Year    Country       City  AverageTemperature  \
0    0.80N   103.66E  1993   Malaysia    Sekudai              26.447   
1    0.80N   103.66E  1993  Singapore  Singapore              26.447   
2    0.80N   103.66E  2003   Malaysia    Sekudai              26.406   

   AverageTemperatureUncertainty State  Month  avg_yrly_temp  \
0                          0.186   NaN     12          26.93   
1                          0.186   NaN     12          26.93   
2                          0.258   NaN     12          27.31   

   mnthly_high_temp  mnthly_low_temp  
0             27.95            26.19  
1             27.95            26.19  
2             28.57            26.41  


In [42]:
print(temp_df.Country.unique())
print(len(temp_df))

['Malaysia' 'Singapore' 'Indonesia' 'Congo (Democratic Republic Of The)'
 'Uganda' 'Kenya' 'Somalia' 'Brazil' 'Colombia' 'Ecuador' 'Gabon'
 'Tanzania' 'Nigeria' 'Vietnam' 'Philippines' 'Guinea' 'Cameroon' 'Benin'
 'Burkina Faso' 'Ethiopia' 'Mali' 'Venezuela' 'India' 'Costa Rica' 'Peru'
 'Thailand' 'Cambodia' 'Guinea Bissau' 'Senegal' 'Djibouti' 'Nicaragua'
 'Burma' 'Angola' 'Zambia' 'Gambia' 'Niger' 'Sudan' 'Yemen' 'Honduras'
 'El Salvador' 'Malawi' 'Mozambique' 'Eritrea' 'Guatemala' 'Mexico'
 'Madagascar' 'Saudi Arabia' 'Oman' 'Australia' 'Bolivia' 'Laos'
 'Mauritania' 'Puerto Rico' 'Dominican Republic' 'Haiti' 'Jamaica'
 'Zimbabwe' 'Chile' 'Equatorial Guinea' 'Rwanda' 'China' 'Cuba' 'Mauritius'
 'Reunion' 'Hong Kong' 'Taiwan' 'Egypt' 'Bangladesh' 'Namibia'
 'South Africa' 'Qatar' 'United Arab Emirates' 'Pakistan' 'Bahamas'
 'Botswana' 'Paraguay' 'Argentina' 'Libya' 'Bahrain' 'Iran' 'United States'
 'Nepal' 'Swaziland' 'Spain' 'Morocco' 'Lesotho' 'Japan' 'Israel' 'Algeria'
 'Afghanist

In [43]:
#Collapse on year
temp_index = list(temp_df.columns)
temp_index.remove('avg_yrly_temp')
temp_index.remove('Year')
temp_df = temp_df.fillna('dummy') #otherwise it drops countries due to State=NaN
temp_pivot = pd.pivot_table(temp_df, values='avg_yrly_temp', index=temp_index, columns=['Year'], aggfunc=np.sum)
temp_pivot = temp_pivot.reset_index()
temp_pivot.columns = temp_pivot.columns.astype(str)
temp_pivot['State'][temp_pivot.State=='dummy'] = np.nan #revert to nan

#get delta values
temp_pivot[['1993','2003','2013']] = temp_pivot[['1993','2003','2013']].fillna(method='ffill')
temp_pivot = temp_pivot.iloc[2::3, :] #only keep 2013 rows
temp_pivot['temp_delta_10_yrs'] = temp_pivot['2013']-temp_pivot['2003']
temp_pivot['temp_delta_20_yrs'] = temp_pivot['2013']-temp_pivot['1993']
temp_pivot.reset_index()

#limit to relevant columns
temp_columns = ['City', 'Country', 'State','avg_yrly_temp', 'mnthly_high_temp', 'mnthly_low_temp', 'temp_delta_10_yrs', 'temp_delta_20_yrs', 'Latitude','Longitude']
temp_pivot = temp_pivot.rename(columns={'AverageTemperature':'avg_yrly_temp'})
temp_pivot = temp_pivot.rename_axis(None, axis = 1)
temp_pivot = temp_pivot.reset_index()
temp_pivot = temp_pivot[temp_columns]

temp_pivot.head(6)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,City,Country,State,avg_yrly_temp,mnthly_high_temp,mnthly_low_temp,temp_delta_10_yrs,temp_delta_20_yrs,Latitude,Longitude
0,Sekudai,Malaysia,,27.372,28.66,26.56,0.29,0.67,0.80N,103.66E
1,Singapore,Singapore,,27.372,28.66,26.56,0.29,0.67,0.80N,103.66E
2,Singkawang,Indonesia,,27.684,29.23,27.13,0.35,0.59,0.80N,108.48E
3,Kuching,Malaysia,,27.001,27.88,26.47,0.35,0.58,0.80N,110.09E
4,Bontang,Indonesia,,27.212,27.65,26.38,0.05,0.31,0.80N,118.13E
5,Gorontalo,Indonesia,,26.652,26.95,25.98,-0.2,0.19,0.80N,122.95E


In [44]:
temp_pivot.to_csv('data_second_cleaning/temperature.csv', index=False) #save for to use in step 4

This new temperature dataset is less than 1MB whereas we started with a dataset >500MB.