### Creating, Reading and Writing

In [77]:
# To use pandas

import pandas as pd

In [78]:
# Build a DataFrame from a dict: each key becomes a column name and each list holds that column's values.
pd.DataFrame({'Yes': [50, 21], 'No': [131, 2]})

Unnamed: 0,Yes,No
0,50,131
1,21,2


- A DataFrame is a table. It contains an array of individual entries, each of which has a certain value. Each entry corresponds to a row (or record) and a column.

In [79]:
# With no index given, the rows get the default labels 0, 1, ... (the first column of the output).
pd.DataFrame({'Yes': [50, 21], 'No': [131, 2]})

Unnamed: 0,Yes,No
0,50,131
1,21,2


In [80]:
# Column values do not have to be numbers; here each column holds strings.
pd.DataFrame(
    {
        'Bob': ['I liked it.', 'It was awful.'],
        'Sue': ['Pretty good', 'Bland.'],
    }
)

Unnamed: 0,Bob,Sue
0,I liked it.,Pretty good
1,It was awful.,Bland.


In [81]:
# The list of row labels used in a DataFrame is known as an index.
# Passing index= replaces the default 0, 1, ... labels with the given ones.

pd.DataFrame({'Bob': ['I liked it.', 'It was awful'],
              'Sue': ['Pretty good', 'Bland']},
             index=['Product A', 'Product B'])

Unnamed: 0,Bob,Sue
Product A,I liked it.,Pretty good
Product B,It was awful,Bland


- A Series is a sequence of data values. If a DataFrame is a table, a Series is a list.

In [82]:
# Build a Series from a list; with no index given, the values get the default labels 0, 1, ...
pd.Series([1,2,3,4,5])

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [83]:
# A Series is, in essence, a single column of a DataFrame:
# it has row labels (index) and one overall name instead of a column name.

pd.Series(
    [30, 35, 40],
    index=['2015 Sales', '2016 Sales', '2017 Sales'],
    name='Product A',
)

2015 Sales    30
2016 Sales    35
2017 Sales    40
Name: Product A, dtype: int64

- A CSV (Comma-Separated Values) file is a table of values separated by commas.

In [84]:
# Read the CSV into a DataFrame; the file is decoded as EUC-KR (its column names are Korean).
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs where this file exists.
finance_reviews = pd.read_csv('C:/Users/pps/Desktop/배정환(인턴)/finance.csv', encoding='euc-kr')

In [85]:
# shape is (rows, columns): 100 records across 3 columns, i.e. 300 entries in total.

finance_reviews.shape

(100, 3)

In [86]:
# head() returns the first five rows of the DataFrame.
# At this point all three CSV columns (나이, 학력수준, 소득) are regular data columns
# and the rows carry the default 0, 1, ... labels.

finance_reviews.head()

Unnamed: 0,나이,학력수준,소득
0,54,14,66814.19458
1,40,12,42144.33812
2,35,14,25697.76715
3,55,12,35976.874
4,40,12,39060.60606


In [87]:
# Reload the file with index_col=0 so that the first CSV column (나이) becomes the row index
# instead of a data column; only 학력수준 and 소득 remain as columns.
finance_reviews = pd.read_csv('C:/Users/pps/Desktop/배정환(인턴)/finance.csv', encoding='euc-kr', index_col=0)
finance_reviews.head()

Unnamed: 0_level_0,학력수준,소득
나이,Unnamed: 1_level_1,Unnamed: 2_level_1
54,14,66814.19458
40,12,42144.33812
35,14,25697.76715
55,12,35976.874
40,12,39060.60606


### Indexing, Selecting & Assigning

In [88]:
# Limit the display to at most 5 rows so that large DataFrames and Series are shown in shortened form.

pd.set_option('display.max_rows', 5)

In [89]:
# With display.max_rows set to 5, only the first and last rows are shown, with "..." in between.
finance_reviews

Unnamed: 0_level_0,학력수준,소득
나이,Unnamed: 1_level_1,Unnamed: 2_level_1
54,14,66814.19458
40,12,42144.33812
...,...,...
30,14,52423.44498
28,12,37004.78469


In [90]:
# A column can be read as an attribute of the DataFrame, using the column name as the attribute name.
finance_reviews.학력수준

나이
54    14
40    12
      ..
30    14
28    12
Name: 학력수준, Length: 100, dtype: int64

In [91]:
# The same column can be read with the indexing ([]) operator; it gives the same Series as attribute access.
finance_reviews['학력수준']

나이
54    14
40    12
      ..
30    14
28    12
Name: 학력수준, Length: 100, dtype: int64

In [92]:
# Select a single value: first pick the column (a Series), then index into that Series.
# The index here is 나이, so [54] looks up the row labelled 54, not the row at position 54.
finance_reviews['학력수준'][54]

14

In [93]:
# Indexing in pandas
# Index-based selection : selecting data based on its numerical position in the data (iloc)
# The first row of the DataFrame is finance_reviews.iloc[0].
# iloc ignores the index labels (나이): iloc[54] below does NOT fetch the row labelled 54,
# it fetches the row at position 54 (the 55th row), whose label is 50, as `Name: 50` in the output shows.

finance_reviews.iloc[54]

학력수준        12.0000
소득      169605.2632
Name: 50, dtype: float64

In [94]:
# Both loc and iloc take the row selector first and the column selector second,
# the opposite of plain indexing such as finance_reviews['학력수준'][54], which is column-first, row-second.
# So rows are easy to retrieve, while a column needs an explicit ":" (all rows) in the row position.
# To get a column with iloc

finance_reviews.iloc[:, 0]

나이
54    14
40    12
      ..
30    14
28    12
Name: 학력수준, Length: 100, dtype: int64

In [95]:
# The : operator can also indicate a range of positions; the end is exclusive,
# so :3 picks positions 0, 1 and 2 (the first three rows) of column 0.

finance_reviews.iloc[:3,0]

나이
54    14
40    12
35    14
Name: 학력수준, dtype: int64

In [96]:
# Or, to select just the second and third entries (positions 1 and 2)

finance_reviews.iloc[1:3,0]

나이
40    12
35    14
Name: 학력수준, dtype: int64

In [97]:
# A list of positions can be passed as well; [0,1,2] selects the same rows as :3.

finance_reviews.iloc[[0,1,2], 0]

나이
54    14
40    12
35    14
Name: 학력수준, dtype: int64

In [98]:
# Negative numbers can be used in selection.
# They count from the end of the data, so -5: selects the last five rows.

finance_reviews.iloc[-5:]

Unnamed: 0_level_0,학력수준,소득
나이,Unnamed: 1_level_1,Unnamed: 2_level_1
25,14,4625.598086
50,17,50367.6236
67,14,66814.19458
30,14,52423.44498
28,12,37004.78469


In [99]:
# Label-based selection (loc) : selects by the index value (label) of the data, not by its position.
# Here: the 학력수준 value of the row whose index label (나이) is 54.

finance_reviews.loc[54, '학력수준']

14.0

In [100]:
# loc : accesses rows and columns using their labels (index values and column names).
# When a dataset has meaningful index values (like dates, names, categories), loc makes operations more intuitive because the labels can be used directly.
# Here: all rows (":") and the two named columns.

finance_reviews.loc[:, ['학력수준', '소득']]

Unnamed: 0_level_0,학력수준,소득
나이,Unnamed: 1_level_1,Unnamed: 2_level_1
54,14,66814.19458
40,12,42144.33812
...,...,...
30,14,52423.44498
28,12,37004.78469


In [101]:
# Manipulating the index
# set_index() returns a DataFrame that uses the given column (소득) as its index.
# The result is not assigned here, so finance_reviews itself keeps 나이 as its index.

finance_reviews.set_index("소득")

Unnamed: 0_level_0,학력수준
소득,Unnamed: 1_level_1
66814.19458,14
42144.33812,12
...,...
52423.44498,14
37004.78469,12


In [102]:
# Conditional selection
# Comparing a column with a value gives a Series of True/False values, one per row.

finance_reviews.학력수준 == 14

나이
54     True
40    False
      ...  
30     True
28    False
Name: 학력수준, Length: 100, dtype: bool

In [103]:
# Passing the condition to loc keeps only the rows where 학력수준 is 14, i.e. the people with that education level.
finance_reviews.loc[finance_reviews.학력수준 == 14]

Unnamed: 0_level_0,학력수준,소득
나이,Unnamed: 1_level_1,Unnamed: 2_level_1
54,14,66814.19458
35,14,25697.76715
...,...,...
67,14,66814.19458
30,14,52423.44498


In [104]:
# The & (ampersand) operator combines two conditions: a row is kept only if both are true.
# Each condition needs its own parentheses because & binds more tightly than == and >=.
# Here: 학력수준 is 14 AND 소득 is at least 55000.

finance_reviews.loc[(finance_reviews.학력수준 == 14) & (finance_reviews.소득 >= 55000)]

Unnamed: 0_level_0,학력수준,소득
나이,Unnamed: 1_level_1,Unnamed: 2_level_1
54,14,66814.19458
40,14,71953.74801
...,...,...
44,14,177828.54860
67,14,66814.19458


In [105]:
# The | (pipe) operator keeps a row if either condition is true.
# Here: 학력수준 is 14 OR 소득 is at least 55000.

finance_reviews.loc[(finance_reviews.학력수준 == 14) | (finance_reviews.소득 >= 55000)]

Unnamed: 0_level_0,학력수준,소득
나이,Unnamed: 1_level_1,Unnamed: 2_level_1
54,14,66814.19458
35,14,25697.76715
...,...,...
67,14,66814.19458
30,14,52423.44498


In [106]:
# Built-in conditional selectors
# isin : selects the rows whose value "is in" a list of values (here: 학력수준 equal to 14 or 12)

finance_reviews.loc[finance_reviews.학력수준.isin([14, 12])]

Unnamed: 0_level_0,학력수준,소득
나이,Unnamed: 1_level_1,Unnamed: 2_level_1
54,14,66814.19458
40,12,42144.33812
...,...,...
30,14,52423.44498
28,12,37004.78469


#### Example of isnull() and notnull()

In [107]:
# isnull (and its companion notnull) : highlight values which are (or are not) empty (NaN)
# pandas represents a missing value as 'NaN' ('None' is also treated as missing)
# isnull() : True where the value is NaN, False where it is not
# notnull() : False where the value is NaN, True where it is not (the opposite of isnull())

# finance_reviews.loc[finance_reviews.학력수준.notnull()] # It is not collecting missing value

# import pandas as pd
# from pandas import DataFrame

# df_left = DataFrame({'KEY':['K0','K1','K2','K3'], 'A': ['A0','A1','A2','A3'], 'B': [0.5, 2.2, 3.6, 0.4]})
# df_right = DataFrame({'KEY': ['K2', 'K3', 'K4', 'K5'], 'C': ['C2', 'C3', 'C4', 'C5'], 'D': ['D2', 'D3', 'D4', 'D5']})

# df_all = pd.merge(df_left, df_right, how='outer', on='KEY')

# df_all

In [108]:
# df_all.loc[df_all.A.notnull()]

In [110]:
# Assigning data to DataFrame
# Assigning a single value sets every row of the column to that value.
# Note: this overwrites the 학력수준 values that were loaded from the CSV.

finance_reviews['학력수준'] = 11
finance_reviews['학력수준']

나이
54    11
40    11
      ..
30    11
28    11
Name: 학력수준, Length: 100, dtype: int64

In [111]:
# An iterable of values can be assigned too: 소득 becomes a countdown from the number of rows down to 1.
# Note: this overwrites the 소득 values that were loaded from the CSV.
finance_reviews['소득'] = range(len(finance_reviews), 0 , -1)
finance_reviews['소득']

나이
54    100
40     99
     ... 
30      2
28      1
Name: 소득, Length: 100, dtype: int64

### Summary Functions and Maps

In [112]:
# The DataFrame as it stands now: 학력수준 is 11 in every row and 소득 counts down from 100 to 1.
finance_reviews

Unnamed: 0_level_0,학력수준,소득
나이,Unnamed: 1_level_1,Unnamed: 2_level_1
54,11,100
40,11,99
...,...,...
30,11,2
28,11,1


In [113]:
# describe() : summary function giving a high-level overview of a column.
# For numerical data it reports statistics such as count, mean, percentiles and max
# (the output is shortened with "..." because display.max_rows is 5).

finance_reviews.소득.describe()

count    100.00
mean      50.50
          ...  
75%       75.25
max      100.00
Name: 소득, Length: 8, dtype: float64

In [115]:
# For string data: load the Titanic dataset, which has text columns such as sex and embark_town.
# pandas is already imported as pd in the first cell of the notebook, so it is not imported again here.
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs where this file exists.

titanic_reviews = pd.read_csv('C:/Users/pps/Desktop/배정환(인턴)/titanic.csv')

titanic_reviews

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.7500,Q,Third,man,True,,Queenstown,no,True


In [116]:
# Summary for string data: describe() reports the count, the number of unique values,
# the most frequent value (top) and how often it occurs (freq).

titanic_reviews.sex.describe()

count      891
unique       2
top       male
freq       577
Name: sex, dtype: object

In [117]:
# A single summary statistic of a column (a Series) can be obtained with its own method,
# e.g. mean() for the average of the age column.

titanic_reviews.age.mean()

29.69911764705882

In [118]:
# unique() lists the distinct values of a column.
# The nan in the result shows that embark_town is missing for some rows.

titanic_reviews.embark_town.unique()

array(['Southampton', 'Cherbourg', 'Queenstown', nan], dtype=object)