In [2]:
#eda - process of analysing data without use of stat testing procedures
#eda - display relationships among data to detect patterns

import pandas as pd
import numpy as np
college = pd.read_csv('data/college.csv')
college.head()
college.info

<bound method DataFrame.info of                                                  INSTNM              CITY  \
0                              Alabama A & M University            Normal   
1                   University of Alabama at Birmingham        Birmingham   
2                                    Amridge University        Montgomery   
3                   University of Alabama in Huntsville        Huntsville   
4                              Alabama State University        Montgomery   
5                             The University of Alabama        Tuscaloosa   
6                     Central Alabama Community College    Alexander City   
7                               Athens State University            Athens   
8                       Auburn University at Montgomery        Montgomery   
9                                     Auburn University            Auburn   
10                          Birmingham Southern College        Birmingham   
11               Chattahoochee Valley Commun

In [3]:
college.describe(include = [np.object, pd.Categorical]).T #stats for object and categorial columns

Unnamed: 0,count,unique,top,freq
INSTNM,7535,7535,Walsh University,1
CITY,7535,2514,New York,87
STABBR,7535,59,CA,773
MD_EARN_WNE_P10,6413,598,PrivacySuppressed,822
GRAD_DEBT_MDN_SUPP,7503,2038,PrivacySuppressed,1510


In [4]:
college.describe(include = [np.number]).T #np.number -> integer + floats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
SATMTMID,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
DISTANCEONLY,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
UGDS,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
UGDS_WHITE,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
UGDS_BLACK,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0


In [5]:
#continous data - numeric / infinite number of possibilities
#categorical data - discrete / finite number of possibilities
college.describe(include = [np.number], percentiles=[0.01, .05, 0.09]).T

Unnamed: 0,count,mean,std,min,1%,5%,9%,50%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,0.0,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,0.0,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,0.0,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,0.0,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,390.0,430.0,445.0,510.0,765.0
SATMTMID,1196.0,530.76505,73.469767,310.0,395.0,430.0,450.0,520.0,785.0
DISTANCEONLY,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,0.0,1.0
UGDS,6874.0,2356.83794,5474.275871,0.0,14.0,31.65,46.0,412.5,151558.0
UGDS_WHITE,6874.0,0.510207,0.286958,0.0,0.0,0.013265,0.053257,0.5557,1.0
UGDS_BLACK,6874.0,0.189997,0.224587,0.0,0.0,0.0,0.0045,0.10005,1.0


In [6]:
#data dictionary - table of metadata and notes on each column of data
college_data_dict = pd.read_csv('data/college_data_dictionary.csv')
college_data_dict.describe(include = [np.object, pd.Categorical]).T #stats for object and categorial columns
#college_data_dict.describe(include = [np.number], percentiles=[.01, .05, .10, .25, .5, .75, .9, .95, .99]).T

Unnamed: 0,count,unique,top,freq
column_name,27,27,UGDS,1
description,27,27,0/1 Men Only,1


In [7]:
#reducing memory
#pandas doesnt classify data as either categorial or numerical but it has precise technical definitions for many distinct data types

diff_cols = ['RELAFFIL', 'STAMTMID', 'CURROPER', 'INSTNM', 'STABBR']
new_col = college.loc[:, diff_cols]
#new_col.head()
new_col.dtypes

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


RELAFFIL      int64
STAMTMID    float64
CURROPER      int64
INSTNM       object
STABBR       object
dtype: object

In [8]:
#find memory use of each column
og_mem = new_col.memory_usage(deep = True)
og_mem

Index           80
RELAFFIL     60280
STAMTMID     60280
CURROPER     60280
INSTNM      660240
STABBR      444565
dtype: int64

In [9]:
#converting float to int
new_col['RELAFFIL'] = new_col['RELAFFIL'].astype(np.int8)
new_col.dtypes

RELAFFIL       int8
STAMTMID    float64
CURROPER      int64
INSTNM       object
STABBR       object
dtype: object

In [10]:
new_mem = new_col.memory_usage(deep = True) #extract exact amount of memory of ech object data type we set deep to true
new_mem

Index           80
RELAFFIL      7535
STAMTMID     60280
CURROPER     60280
INSTNM      660240
STABBR      444565
dtype: int64

In [11]:
difference_in_memory = new_mem // og_mem
difference_in_memory

Index       1
RELAFFIL    0
STAMTMID    1
CURROPER    1
INSTNM      1
STABBR      1
dtype: int64

In [12]:
#pandas defaults the ints and gloats to 64 bits regardless of max needed size for particular DF
'''Memory units displayed are in bytes and not bits. One byte is
equivalent to 8 bits, so when RELAFFIL was changed to an 8-bit integer, it
uses one 1 byte of memory and as there are 7,535 rows, its memory
footprint is equivalent to 7,535 bytes.'''
#if no index is specified during df creation - pandas defaults to RangeIndex

college.loc[0, 'CURROPER'] = 1000000
college.loc[0, 'INSTNM'] = college.loc[0, 'INSTNM'] + 'a'
college[['CURROPER', 'INSTNM']].memory_usage(deep = True)


Index           80
CURROPER     60280
INSTNM      660293
dtype: int64

In [15]:
movies = pd.read_csv('data/movie.csv')
movies_ = movies[['movie_title', 'imdb_score', 'budget']]
movies_.head()

Unnamed: 0,movie_title,imdb_score,budget
0,Avatar,7.9,237000000.0
1,Pirates of the Caribbean: At World's End,7.1,300000000.0
2,Spectre,6.8,245000000.0
3,The Dark Knight Rises,8.5,250000000.0
4,Star Wars: Episode VII - The Force Awakens,7.1,


In [16]:
movies_.nlargest(100, 'imdb_score').head()#selects n largsest elements from the df

Unnamed: 0,movie_title,imdb_score,budget
2725,Towering Inferno,9.5,
1920,The Shawshank Redemption,9.3,25000000.0
3402,The Godfather,9.2,6000000.0
2779,Dekalog,9.1,
4312,Kickboxer: Vengeance,9.1,17000000.0


In [17]:
movies_.nlargest(100, 'imdb_score').nsmallest(5, 'budget') #selects m smallest from n largest in df

Unnamed: 0,movie_title,imdb_score,budget
4804,Butterfly Girl,8.7,180000.0
4801,Children of Heaven,8.5,180000.0
4706,12 Angry Men,8.9,350000.0
4550,A Separation,8.4,500000.0
4636,The Other Dream Team,8.4,500000.0


In [19]:
#selecting largest of each group by sorting
movie = pd.read_csv('data/movie.csv')
movie2 = movie[['movie_title', 'imdb_score', 'title_year']]
movie2.sort_values('title_year', ascending = False).head()

Unnamed: 0,movie_title,imdb_score,title_year
3884,The Veil,4.7,2016.0
2375,My Big Fat Greek Wedding 2,6.1,2016.0
2794,Miracles from Heaven,6.8,2016.0
92,Independence Day: Resurgence,5.5,2016.0
153,Kung Fu Panda 3,7.2,2016.0


In [21]:
movie3 = movie2.sort_values(['title_year', 'imdb_score'], ascending = False)
movie3.head() #sorting on multi columns

Unnamed: 0,movie_title,imdb_score,title_year
4312,Kickboxer: Vengeance,9.1,2016.0
4277,A Beginner's Guide to Snuff,8.7,2016.0
3798,Airlift,8.5,2016.0
27,Captain America: Civil War,8.2,2016.0
98,Godzilla Resurgence,8.2,2016.0


In [23]:
movie_top_for_each_year = movie3.drop_duplicates(subset='title_year')
movie_top_for_each_year.head()

Unnamed: 0,movie_title,imdb_score,title_year
4312,Kickboxer: Vengeance,9.1,2016.0
3745,Running Forever,8.6,2015.0
4369,Queen of the Mountains,8.7,2014.0
3935,"Batman: The Dark Knight Returns, Part 2",8.4,2013.0
3,The Dark Knight Rises,8.5,2012.0


In [25]:
import pandas_datareader as pdr 
tsla = pdr.DataReader('tsla', data_source='google', start='2017-1-1')
tsla.head(10)

ImmediateDeprecationError: 
Google finance has been immediately deprecated due to large breaks in the API without the
introduction of a stable replacement. Pull Requests to re-enable these data
connectors are welcome.

See https://github.com/pydata/pandas-datareader/issues
