In [1]:
import sys

import numpy as np
import pandas as pd

# Reading and writing text files

In [2]:
# Load the comma-separated text file into a DataFrame.
# header=None means no line of the file is used for column names, so the
# columns get default integer labels (0, 1, 2), as the output shows.
df = pd.read_csv(
    '/content/sample_data/sample.txt',
    header=None,
)

df

Unnamed: 0,0,1,2
0,man,24,56
1,woman,34,89
2,man,78,60
3,man,89,58
4,man,27,90
5,woman,22,54


In [3]:
# The same file loaded through read_table. read_table splits on tabs by
# default, so the comma separator has to be passed explicitly.
df = pd.read_table(
    '/content/sample_data/sample.txt',
    header=None,
    sep=',',
)
df

Unnamed: 0,0,1,2
0,man,24,56
1,woman,34,89
2,man,78,60
3,man,89,58
4,man,27,90
5,woman,22,54


In [4]:
# Load only the first three rows of the file by capping the read with nrows.
df = pd.read_csv(
    '/content/sample_data/sample.txt',
    nrows=3,
    header=None,
)
df

Unnamed: 0,0,1,2
0,man,24,56
1,woman,34,89
2,man,78,60


In [5]:
# Serialise the DataFrame as CSV text. Passing sys.stdout as the target prints
# the CSV in the cell output instead of creating a file on disk.
# header=False (the documented boolean form) leaves out the column-name row;
# the row index is still written as the first field of every line.
df.to_csv(sys.stdout, header=False)

0,man,24,56
1,woman,34,89
2,man,78,60


# Reading data from HTML or a website

In [6]:
# read_html parses the HTML tables found on the page and returns them as a
# list of DataFrames. The public top-level pd.read_html entry point is used
# instead of reaching into the pd.io.html submodule.
# NOTE: needs network access and an HTML parser backend (lxml, or
# bs4 + html5lib) to be installed.
df_list = pd.read_html('https://en.wikipedia.org/wiki/List_of_Tom_Holland_performances')

# Preview the first table found on the page.
df_list[0].head()

Unnamed: 0,Tom Holland,Tom Holland.1
0,Holland in 2018,Holland in 2018
1,Born,"Thomas Stanley Holland (age 25)London, England"
2,Education,Wimbledon CollegeBRIT School for Performing Ar...
3,Occupation,Actor
4,Years active,2006–present


# Merge

In [7]:
# Two small frames that share a 'key' column; used below to demonstrate merge.
df1 = pd.DataFrame({
    'key': (1, 2, 3),
    'values 1': ('a', 'b', 'c'),
})
df2 = pd.DataFrame({
    'key': (1, 2, 3),
    'values 2': ('d', 'e', 'f'),
})

In [8]:
# Combine the two frames row-by-row wherever their 'key' values match
# (merge performs an inner join unless `how` says otherwise).
df3 = df1.merge(df2, on='key')
df3

Unnamed: 0,key,values 1,values 2
0,1,a,d
1,2,b,e
2,3,c,f


In [9]:
# Employee records split across two frames that share the 'employee' column.
# The employees are listed in a different order in each frame.
# NOTE: this rebinds df1 and df2, replacing the frames built earlier.
df1 = pd.DataFrame({
    'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'group': ['Accounting', 'Engineering', 'Engineering', 'HR'],
})
df2 = pd.DataFrame({
    'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
    'hire_date': [2004, 2008, 2012, 2014],
})
display(df1, df2)

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


Unnamed: 0,employee,hire_date
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014


In [10]:
# No `on` is given, so the join key is the column both frames share
# ('employee'). how='right' keeps every row of df2, in df2's row order.
df1.merge(df2, how='right')

Unnamed: 0,employee,group,hire_date
0,Lisa,Engineering,2004
1,Bob,Accounting,2008
2,Jake,Engineering,2012
3,Sue,HR,2014


In [11]:
# Supervisor lookup keyed by 'group'. Each group appears once here, whereas
# df1 lists 'Engineering' for two employees.
df4 = pd.DataFrame({
    'group': ['Accounting', 'Engineering', 'HR'],
    'supervisor': ['Carly', 'Guido', 'Steve'],
})
display(df1, df4)

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


Unnamed: 0,group,supervisor
0,Accounting,Carly
1,Engineering,Guido
2,HR,Steve


In [12]:
# Join on the shared 'group' column. A supervisor row is repeated for every
# employee in that group (Guido appears for both Engineering employees).
pd.merge(left=df1, right=df4, how='right')

Unnamed: 0,employee,group,supervisor
0,Bob,Accounting,Carly
1,Jake,Engineering,Guido
2,Lisa,Engineering,Guido
3,Sue,HR,Steve


# Merge by index

In [13]:
# Show df1 without its last row. The slice is only displayed; df1 itself is
# not reassigned and keeps all of its rows.
df1.iloc[:-1]

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering


In [14]:
# A frame whose row labels are group names. No column names are supplied, so
# the single data column gets the default label 0.
df5 = pd.DataFrame(
    data=['Male', 'Female'],
    index=['Accounting', 'Engineering'],
)
df5

Unnamed: 0,0
Accounting,Male
Engineering,Female


In [15]:
# Match df1's 'group' column against df5's index labels. df5 has no 'HR'
# label, so the HR employee (Sue) does not appear in the result.
df1.merge(df5, left_on='group', right_index=True)

Unnamed: 0,employee,group,0
0,Bob,Accounting,Male
1,Jake,Engineering,Female
2,Lisa,Engineering,Female


In [16]:
# Left-hand frame for a two-part key: the key1 and key2 columns are matched
# together against a frame that has a two-level index.
df_one = pd.DataFrame({
    'key1': ['Melvin', 'Albin', 'Malu', 'Ann'],
    'key2': [28, 23, 25, 24],
})
df_one

Unnamed: 0,key1,key2
0,Melvin,28
1,Albin,23
2,Malu,25
3,Ann,24


In [17]:
# Eight rows holding the integers 0..15 (two per row) under a two-level
# index: the first level holds names, the second level holds numbers.
df_two = pd.DataFrame(
    np.arange(16).reshape(8, 2),
    index=[
        ['Albin', 'Albin', 'Malu', 'Malu', 'Ann', 'Ann', 'Melvin', 'Melvin'],
        [23, 11, 25, 13, 24, 12, 28, 14],
    ],
)
df_two

Unnamed: 0,Unnamed: 1,0,1
Albin,23,0,1
Albin,11,2,3
Malu,25,4,5
Malu,13,6,7
Ann,24,8,9
Ann,12,10,11
Melvin,28,12,13
Melvin,14,14,15


In [18]:
# Pair df_one's (key1, key2) columns with the two levels of df_two's index.
# Only index entries that have a matching (key1, key2) row appear in the result.
pd.merge(
    df_one,
    df_two,
    left_on=['key1', 'key2'],
    right_index=True,
)

Unnamed: 0,key1,key2,0,1
0,Melvin,28,12,13
1,Albin,23,0,1
2,Malu,25,4,5
3,Ann,24,8,9


# Using join instead of merge

In [19]:
# The same lookup written with DataFrame.join: the key1/key2 columns of
# df_one are matched against df_two's index, keeping every row of df_one.
df_one.join(
    df_two,
    how='left',
    on=['key1', 'key2'],
)

Unnamed: 0,key1,key2,0,1
0,Melvin,28,12,13
1,Albin,23,0,1
2,Malu,25,4,5
3,Ann,24,8,9


In [20]:
# Add an extra column to df_one. In the how='right' join in the next cell it
# shows up as NaN for rows of df_two that have no match in df_one.
# NOTE: this mutates df_one in place, so earlier cells that display or join
# df_one will include 'new_column' if they are re-run after this one.
df_one['new_column'] = [90, 90, 78, 67]
df_one

Unnamed: 0,key1,key2,new_column
0,Melvin,28,90
1,Albin,23,90
2,Malu,25,78
3,Ann,24,67


In [21]:
# how='right' keeps every entry of df_two's index. Index pairs with no match
# in df_one get NaN in new_column, which is why that column shows as float.
df_one.join(
    df_two,
    how='right',
    on=['key1', 'key2'],
)

Unnamed: 0,key1,key2,new_column,0,1
1,Albin,23,90.0,0,1
3,Albin,11,,2,3
2,Malu,25,78.0,4,5
3,Malu,13,,6,7
3,Ann,24,67.0,8,9
3,Ann,12,,10,11
0,Melvin,28,90.0,12,13
3,Melvin,14,,14,15


# Concat

In [22]:
# NumPy: two 5x2 integer arrays joined along each axis in turn.
a = np.arange(10).reshape(5, 2)
b = np.arange(20, 30).reshape(5, 2)

# axis=1 puts b's columns to the right of a's, giving a 5x4 array.
display(np.concatenate([a, b], axis=1))
print()
# axis=0 stacks b's rows underneath a's, giving a 10x2 array.
display(np.concatenate([a, b], axis=0))

array([[ 0,  1, 20, 21],
       [ 2,  3, 22, 23],
       [ 4,  5, 24, 25],
       [ 6,  7, 26, 27],
       [ 8,  9, 28, 29]])




array([[ 0,  1],
       [ 2,  3],
       [ 4,  5],
       [ 6,  7],
       [ 8,  9],
       [20, 21],
       [22, 23],
       [24, 25],
       [26, 27],
       [28, 29]])

In [23]:
# Pandas: the same two arrays wrapped as DataFrames.
a_df = pd.DataFrame(a)
b_df = pd.DataFrame(b)

# Along the index the rows are stacked; each frame keeps its own 0-4 row
# labels, so the labels repeat in the result.
display(pd.concat([a_df, b_df], axis='index'))
print()
# Along the columns the frames sit side by side; the column labels 0 and 1
# therefore appear twice.
display(pd.concat([a_df, b_df], axis='columns'))

# keys adds an outer column level that names the frame each column came from.
print()
display(pd.concat([a_df, b_df], keys=['cat1', 'cat2'], axis='columns'))

Unnamed: 0,0,1
0,0,1
1,2,3
2,4,5
3,6,7
4,8,9
0,20,21
1,22,23
2,24,25
3,26,27
4,28,29





Unnamed: 0,0,1,0.1,1.1
0,0,1,20,21
1,2,3,22,23
2,4,5,24,25
3,6,7,26,27
4,8,9,28,29





Unnamed: 0_level_0,cat1,cat1,cat2,cat2
Unnamed: 0_level_1,0,1,0,1
0,0,1,20,21
1,2,3,22,23
2,4,5,24,25
3,6,7,26,27
4,8,9,28,29


In [25]:
# ignore_index=True discards the labels along the concatenation axis. With
# axis=1 that means the column labels: the combined columns are renumbered
# 0..3, while the row labels stay as they were.
a_df = pd.DataFrame(a)
b_df = pd.DataFrame(b)
display(pd.concat([a_df, b_df], ignore_index=True, axis=1))

Unnamed: 0,0,1,2,3
0,0,1,20,21
1,2,3,22,23
2,4,5,24,25
3,6,7,26,27
4,8,9,28,29
