In [32]:
import pandas as pd
import numpy as np

#load the survey schema and use its 'Column' field as the row index
schema_df = pd.read_csv(
    'data/survey_results_schema.csv',
    index_col='Column',
)

In [33]:
#demonstrate setting and resetting pandas display options

#show up to 61 rows and 61 columns before pandas truncates the output
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, 61)

#lift the column-width limit, then immediately restore its default,
#so only the row/column limits above remain in effect
pd.set_option('display.max_colwidth', None)
pd.reset_option('display.max_colwidth')

In [34]:
#optional: uncomment to turn the 'Column' index back into a regular column
#schema_df.reset_index(inplace=True)

#display the schema (rows are labelled by the 'Column' index)
schema_df

Unnamed: 0_level_0,QuestionText
Column,Unnamed: 1_level_1
Respondent,Randomized respondent ID number (not in order ...
MainBranch,Which of the following options best describes ...
Hobbyist,Do you code as a hobby?
Age,What is your age (in years)? If you prefer not...
Age1stCode,At what age did you write your first line of c...
CompFreq,"Is that compensation weekly, monthly, or yearly?"
CompTotal,What is your current total compensation (salar...
ConvertedComp,Salary converted to annual USD salaries using ...
Country,Where do you live?
CurrencyDesc,Which currency do you use day-to-day? If your ...


In [35]:
#access a row by its index label (returns the row as a Series)
schema_df.loc['ConvertedComp']

QuestionText    Salary converted to annual USD salaries using ...
Name: ConvertedComp, dtype: object

In [36]:
#access a single cell by row label and column label (returns the stored value itself)
schema_df.loc['ConvertedComp','QuestionText']

'Salary converted to annual USD salaries using the exchange rate on 2020-02-19, assuming 12 working months and 50 working weeks.'

In [37]:
#sort the schema rows by index label; rebinding the result instead of using
#inplace=True keeps the step chainable and makes the data flow explicit
schema_df = schema_df.sort_index()

In [38]:
#display the schema again to inspect the row order after sort_index
schema_df

Unnamed: 0_level_0,QuestionText
Column,Unnamed: 1_level_1
Age,What is your age (in years)? If you prefer not...
Age1stCode,At what age did you write your first line of c...
CompFreq,"Is that compensation weekly, monthly, or yearly?"
CompTotal,What is your current total compensation (salar...
ConvertedComp,Salary converted to annual USD salaries using ...
Country,Where do you live?
CurrencyDesc,Which currency do you use day-to-day? If your ...
CurrencySymbol,Which currency do you use day-to-day? If your ...
DatabaseDesireNextYear,Which database environments have you done exte...
DatabaseWorkedWith,Which database environments have you done exte...


# Basics of DataFrame

In [39]:
#create dataframe from dict: each key becomes a column, each list holds that
#column's values (all lists must have the same length)

my_dict = {
    'rows':   ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh'],
    'name':   ['jeff', 'peter', 'julia', 'lea', 'renekton', 'nasus', 'azir'],
    'age':    [24, 53, 22, 24, 98, 988, 2353],
    'gender': [0, 0, 1, 1, 0, 0, 0],
}

#use the 'rows' column as the index; set_index is chained (instead of
#inplace=True) so df is built from my_dict in a single expression
df = pd.DataFrame(my_dict).set_index('rows')

In [40]:
#add columns: each list supplies one value per existing row, in row order

df['city'] = [
    'Berlin', 'Amsterdam', 'Asthana', 'Singapur',
    'Shurima', 'Shurima', 'Shurima',
]
#n squared plus 2 for n = 0..6 -> 2, 3, 6, 11, 18, 27, 38
df['last_column'] = [n ** 2 + 2 for n in range(7)]

In [41]:
#dimensions of the df as (number of rows, number of columns)
df.shape

(7, 5)

In [42]:
#check your df: index, column names, non-null counts, dtypes and memory usage

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, first to seventh
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         7 non-null      object
 1   age          7 non-null      int64 
 2   gender       7 non-null      int64 
 3   city         7 non-null      object
 4   last_column  7 non-null      int64 
dtypes: int64(3), object(2)
memory usage: 336.0+ bytes


In [43]:
#look at your df (the bare expression on the last line is what the cell displays)
df

Unnamed: 0_level_0,name,age,gender,city,last_column
rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
first,jeff,24,0,Berlin,2
second,peter,53,0,Amsterdam,3
third,julia,22,1,Asthana,6
fourth,lea,24,1,Singapur,11
fifth,renekton,98,0,Shurima,18
sixth,nasus,988,0,Shurima,27
seventh,azir,2353,0,Shurima,38


In [44]:
#column labels of the df (returned as an Index object)
df.columns

Index(['name', 'age', 'gender', 'city', 'last_column'], dtype='object')

In [45]:
#access a row by integer position (returns the row as a Series object)
print(df.iloc[0])

#access a single value by row position and column position
df.iloc[0,0]

name             jeff
age                24
gender              0
city           Berlin
last_column         2
Name: first, dtype: object


'jeff'

In [46]:
#returns the rows at the given integer positions (0-based)

df.iloc[[1,3,4]]

Unnamed: 0_level_0,name,age,gender,city,last_column
rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
second,peter,53,0,Amsterdam,3
fourth,lea,24,1,Singapur,11
fifth,renekton,98,0,Shurima,18


In [47]:
#returns selected rows by using integer position slicing
#(start is inclusive, stop is exclusive: 1:-2 skips the first row and the last two rows)

df.iloc[1:-2]

Unnamed: 0_level_0,name,age,gender,city,last_column
rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
second,peter,53,0,Amsterdam,3
third,julia,22,1,Asthana,6
fourth,lea,24,1,Singapur,11
fifth,renekton,98,0,Shurima,18


In [48]:
#returns selected rows and columns by integer position (-1 is the last column)

df.iloc[[1,2,3],[0,-1]]

Unnamed: 0_level_0,name,last_column
rows,Unnamed: 1_level_1,Unnamed: 2_level_1
second,peter,3
third,julia,6
fourth,lea,11


In [49]:
#df[column_name] or df.column_name both return the selected column as a Series
#(prefer df[column_name]: attribute access only works when the column name is a valid
#identifier that does not clash with an existing DataFrame attribute or method)

type(df['name']), type(df.name)

(pandas.core.series.Series, pandas.core.series.Series)

In [50]:
#get the name values of the second and third row, either by index label or by position
#(only the last expression of a cell is displayed; both return the same Series)

#by label: .loc with row labels and the column label in one step (no chained indexing)
df.loc[['second', 'third'], 'name']

#by position: .iloc is explicit about positional access; plain [] with integers on a
#string-labelled Series is deprecated because the integers are ambiguous (label vs position)
df['name'].iloc[[1, 2]]

rows
second    peter
third     julia
Name: name, dtype: object

In [51]:
#select multiple columns by passing a list of column names
#(returns a dataframe containing all rows but only the selected columns)

df[['name','age']]

Unnamed: 0_level_0,name,age
rows,Unnamed: 1_level_1,Unnamed: 2_level_1
first,jeff,24
second,peter,53
third,julia,22
fourth,lea,24
fifth,renekton,98
sixth,nasus,988
seventh,azir,2353


In [52]:
#access row by index label (returns the row as a Series)

print(df.loc['second'])

#returns the value of the age column of the row labelled 'second'
df.loc['second','age']

name               peter
age                   53
gender                 0
city           Amsterdam
last_column            3
Name: second, dtype: object


53

In [53]:
#returns selected rows and columns by label, in the order they are listed

df.loc[['second', 'third'],['city','age']]

Unnamed: 0_level_0,city,age
rows,Unnamed: 1_level_1,Unnamed: 2_level_1
second,Amsterdam,53
third,Asthana,22


In [54]:
#replace the index with values from a list or range (overwrites the current index);
#both examples are left commented out so the label-based lookups elsewhere keep working

#df.index = range(1, len(df)+1)
#df.index = ['1','2','3','4','5','6','7']

In [55]:
#basic filter: boolean mask for rows whose age lies strictly between 52 and 989

filter1 = df['age'].gt(52) & df['age'].lt(989)

#keep only the rows where the mask is True
x = df.loc[filter1]

#number of rows that passed the filter
len(x)

3

In [56]:
#show the rows selected by the filter
x

Unnamed: 0_level_0,name,age,gender,city,last_column
rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
second,peter,53,0,Amsterdam,3
fifth,renekton,98,0,Shurima,18
sixth,nasus,988,0,Shurima,27


In [57]:
#count how often each distinct city value occurs
df['city'].value_counts()

Shurima      3
Berlin       1
Amsterdam    1
Asthana      1
Singapur     1
Name: city, dtype: int64

In [58]:
#label-based slicing with .loc includes both endpoints, for rows and for columns
df.loc['second':'fifth','city':'last_column']

Unnamed: 0_level_0,city,last_column
rows,Unnamed: 1_level_1,Unnamed: 2_level_1
second,Amsterdam,3
third,Asthana,6
fourth,Singapur,11
fifth,Shurima,18
