In [1]:

# We can merge dataframes in a number of different ways!

import pandas as pd

df1 = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
        "E": ["A0", "A1", "D2", "D3"],
    },
    index=[0, 1, 2, 3],
)



df2 = pd.DataFrame(
    {
        "A": ["A4", "A5", "A6", "A7"],
        "B": ["B4", "B5", "B6", "B7"],
        "C": ["C4", "C5", "C6", "C7"],
        "D": ["D4", "D5", "D6", "D7"],
        "E": ["A0", "A1", "D2", "D3"],
    },
    index=[4, 5, 6, 7],
)



df3 = pd.DataFrame(
    {
        "A": ["A8", "A9", "A10", "A11"],
        "B": ["B8", "B9", "B10", "B11"],
        "C": ["C8", "C9", "C10", "C11"],
        "D": ["D8", "D9", "D10", "D11"],
    },
    index=[8, 9, 10, 11],
)


# concat is similar to union, in the SQL world, if axis=.

frames = [df1, df2, df3]
result = pd.concat(frames, axis=0)
result


Unnamed: 0,A,B,C,D,E
0,A0,B0,C0,D0,A0
1,A1,B1,C1,D1,A1
2,A2,B2,C2,D2,D2
3,A3,B3,C3,D3,D3
4,A4,B4,C4,D4,A0
5,A5,B5,C5,D5,A1
6,A6,B6,C6,D6,D2
7,A7,B7,C7,D7,D3
8,A8,B8,C8,D8,
9,A9,B9,C9,D9,


In [3]:

# the merge is done by row, instead of by column, if axis=1.

frames = [df1, df2, df3]
result = pd.concat(frames, axis=1)
result


Unnamed: 0,A,B,C,D,E,A.1,B.1,C.1,D.1,E.1,A.2,B.2,C.2,D.2
0,A0,B0,C0,D0,A0,,,,,,,,,
1,A1,B1,C1,D1,A1,,,,,,,,,
2,A2,B2,C2,D2,D2,,,,,,,,,
3,A3,B3,C3,D3,D3,,,,,,,,,
4,,,,,,A4,B4,C4,D4,A0,,,,
5,,,,,,A5,B5,C5,D5,A1,,,,
6,,,,,,A6,B6,C6,D6,D2,,,,
7,,,,,,A7,B7,C7,D7,D3,,,,
8,,,,,,,,,,,A8,B8,C8,D8
9,,,,,,,,,,,A9,B9,C9,D9


In [4]:

# The indicator parameter creates a column in the merged DataFrame that indicates where the key value in rows come from.

result = df1.merge(df2, how="outer", indicator=True)
result


Unnamed: 0,A,B,C,D,E,_merge
0,A0,B0,C0,D0,A0,left_only
1,A1,B1,C1,D1,A1,left_only
2,A2,B2,C2,D2,D2,left_only
3,A3,B3,C3,D3,D3,left_only
4,A4,B4,C4,D4,A0,right_only
5,A5,B5,C5,D5,A1,right_only
6,A6,B6,C6,D6,D2,right_only
7,A7,B7,C7,D7,D3,right_only


In [4]:

# Use intersection of keys from both frames

result = pd.merge(df1, df2, how="inner", left_on='E', right_on='E')
result


Unnamed: 0,A_x,B_x,C_x,D_x,E,A_y,B_y,C_y,D_y
0,A0,B0,C0,D0,A0,A4,B4,C4,D4
1,A1,B1,C1,D1,A1,A5,B5,C5,D5
2,A2,B2,C2,D2,D2,A6,B6,C6,D6
3,A3,B3,C3,D3,D3,A7,B7,C7,D7


In [5]:

# use keys from the left DF or the right DF

result = pd.merge(df1, df2, how="left", left_on='E', right_on='E')
result


Unnamed: 0,A_x,B_x,C_x,D_x,E,A_y,B_y,C_y,D_y
0,A0,B0,C0,D0,A0,A4,B4,C4,D4
1,A1,B1,C1,D1,A1,A5,B5,C5,D5
2,A2,B2,C2,D2,D2,A6,B6,C6,D6
3,A3,B3,C3,D3,D3,A7,B7,C7,D7


In [6]:

# Use union of keys from both frames

result = pd.merge(df1, df2, how="outer")
result


Unnamed: 0,A,B,C,D,E
0,A0,B0,C0,D0,A0
1,A1,B1,C1,D1,A1
2,A2,B2,C2,D2,D2
3,A3,B3,C3,D3,D3
4,A4,B4,C4,D4,A0
5,A5,B5,C5,D5,A1
6,A6,B6,C6,D6,D2
7,A7,B7,C7,D7,D3


In [7]:

# Create the cartesian product of rows of both frames

result = pd.merge(df1, df2, how="cross")
result

# Resource
# https://pandas.pydata.org/docs/user_guide/merging.html


Unnamed: 0,A_x,B_x,C_x,D_x,E_x,A_y,B_y,C_y,D_y,E_y
0,A0,B0,C0,D0,A0,A4,B4,C4,D4,A0
1,A0,B0,C0,D0,A0,A5,B5,C5,D5,A1
2,A0,B0,C0,D0,A0,A6,B6,C6,D6,D2
3,A0,B0,C0,D0,A0,A7,B7,C7,D7,D3
4,A1,B1,C1,D1,A1,A4,B4,C4,D4,A0
5,A1,B1,C1,D1,A1,A5,B5,C5,D5,A1
6,A1,B1,C1,D1,A1,A6,B6,C6,D6,D2
7,A1,B1,C1,D1,A1,A7,B7,C7,D7,D3
8,A2,B2,C2,D2,D2,A4,B4,C4,D4,A0
9,A2,B2,C2,D2,D2,A5,B5,C5,D5,A1


In [8]:

# take a random sample from a dataframe

print(result.shape)
rnd_samp = result.sample(frac=0.25)
print(rnd_samp.shape)
rnd_samp.head()


(16, 10)
(4, 10)


Unnamed: 0,A_x,B_x,C_x,D_x,E_x,A_y,B_y,C_y,D_y,E_y
11,A2,B2,C2,D2,D2,A7,B7,C7,D7,D3
4,A1,B1,C1,D1,A1,A4,B4,C4,D4,A0
14,A3,B3,C3,D3,D3,A6,B6,C6,D6,D2
6,A1,B1,C1,D1,A1,A6,B6,C6,D6,D2


In [42]:

# We can merge two dataframes based on closest, but not exact matching, times!!!
# This is similar to a left-join except that we match on nearest key rather than equal keys.

import pandas as pd

quotes = pd.DataFrame(
    {
        "time": [
            pd.Timestamp("2016-05-25 13:30:00.023"),
            pd.Timestamp("2016-05-25 13:30:00.023"),
            pd.Timestamp("2016-05-25 13:30:00.030"),
            pd.Timestamp("2016-05-25 13:30:00.041"),
            pd.Timestamp("2016-05-25 13:30:00.048"),
            pd.Timestamp("2016-05-25 13:30:00.049"),
            pd.Timestamp("2016-05-25 13:30:00.072"),
            pd.Timestamp("2016-05-25 13:30:00.075")
        ],
        "ticker": [
               "GOOG",
               "MSFT",
               "MSFT",
               "MSFT",
               "GOOG",
               "AAPL",
               "GOOG",
               "MSFT"
           ],
           "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
           "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
    }
)

print(quotes)

trades = pd.DataFrame(
       {
           "time": [
               pd.Timestamp("2016-05-25 13:30:00.023"),
               pd.Timestamp("2016-05-25 13:30:00.038"),
               pd.Timestamp("2016-05-25 13:30:00.048"),
               pd.Timestamp("2016-05-25 13:30:00.048"),
               pd.Timestamp("2016-05-25 13:30:00.048")
           ],
           "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
           "price": [51.95, 51.95, 720.77, 720.92, 98.0],
           "quantity": [75, 155, 100, 100, 100]
       }
   )

print(trades)


                     time ticker     bid     ask
0 2016-05-25 13:30:00.023   GOOG  720.50  720.93
1 2016-05-25 13:30:00.023   MSFT   51.95   51.96
2 2016-05-25 13:30:00.030   MSFT   51.97   51.98
3 2016-05-25 13:30:00.041   MSFT   51.99   52.00
4 2016-05-25 13:30:00.048   GOOG  720.50  720.93
5 2016-05-25 13:30:00.049   AAPL   97.99   98.01
6 2016-05-25 13:30:00.072   GOOG  720.50  720.88
7 2016-05-25 13:30:00.075   MSFT   52.01   52.03
                     time ticker   price  quantity
0 2016-05-25 13:30:00.023   MSFT   51.95        75
1 2016-05-25 13:30:00.038   MSFT   51.95       155
2 2016-05-25 13:30:00.048   GOOG  720.77       100
3 2016-05-25 13:30:00.048   GOOG  720.92       100
4 2016-05-25 13:30:00.048   AAPL   98.00       100


In [41]:

df_merged = pd.merge_asof(trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms"))
df_merged

# Resource:
# https://pandas.pydata.org/docs/reference/api/pandas.merge_asof.html?highlight=pa


Unnamed: 0,time,ticker,price,quantity,bid,ask
0,2016-05-25 13:30:00.023,MSFT,51.95,75,51.95,51.96
1,2016-05-25 13:30:00.038,MSFT,51.95,155,,
2,2016-05-25 13:30:00.048,GOOG,720.77,100,720.5,720.93
3,2016-05-25 13:30:00.048,GOOG,720.92,100,720.5,720.93
4,2016-05-25 13:30:00.048,AAPL,98.0,100,,


In [None]:

# We can merge two dataframes based on closest latitude coordinates and longitude coordinates!!!
# We must create a GeoDataFrame when starting from a regular DataFrame.

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import geopandas as gpd

import numpy as np
from geopy.geocoders import Nominatim
from geopy.point import Point

geolocator = Nominatim(user_agent="test")

def reverse_geocoding(lat, lon):
    try:
        location = geolocator.reverse(Point(lat, lon))
        return location.raw['place_id']
    except:
        return None

df1 = pd.DataFrame(data={
       'name': ['post', 'sutter', 'oak'],
       'Lat': [41.389474, 41.383093, 41.373258],
       'Long': [2.156421, 2.181116, 2.159358]
      })


df2 = pd.DataFrame(data={
       'id': [0, 1, 2],
       'col1': ['xx','yy','zz'],
       'Lat': [37.787994, 37.789575, 37.813122],
       'Long': [2.156421, -2.181116, 2.168779]
      })

df1['address'] = np.vectorize(reverse_geocoding)(df1['Lat'], df1['Long'])
df2['address'] = np.vectorize(reverse_geocoding)(df2['Lat'], df2['Long'])

result = pd.merge(df1, df2, how='left', left_on='address', right_on='address')

print(result)


In [34]:

gdf1 = geopandas.GeoDataFrame(df1, geometry=geopandas.points_from_xy(df1.Long, df1.Lat))
print(gdf1.dtypes)

gdf2 = geopandas.GeoDataFrame(df2, geometry=geopandas.points_from_xy(df2.Long, df2.Lat))
print(gdf2.dtypes)


name           object
Lat           float64
Long          float64
address         int32
geometry1      object
geometry     geometry
dtype: object
id              int64
col1           object
Lat           float64
Long          float64
address        object
geometry2      object
geometry     geometry
dtype: object


In [37]:

cities_w_country_data = geopandas.sjoin_nearest(gdf1, gdf2)
cities_w_country_data.head(3)

# Resource:
# https://geopandas.org/en/stable/docs/reference/api/geopandas.sjoin_nearest.html
# https://gis.stackexchange.com/questions/404196/create-a-column-geometry-of-points-with-longitude-and-latitude-data-given-in-a


Unnamed: 0,name,Lat_left,Long_left,address_left,geometry1,geometry,index_right,id,col1,Lat_right,Long_right,address_right,geometry2
0,post,41.389474,2.156421,168079376,"2 9m 23.1156s N, 41 23m 22.1064s E",POINT (2.15642 41.38947),2,2,zz,37.813122,2.168779,,"2 10m 7.6044s N, 37 48m 47.2392s E"
1,sutter,41.383093,2.181116,79019270,"2 10m 52.0176s N, 41 22m 59.1348s E",POINT (2.18112 41.38309),2,2,zz,37.813122,2.168779,,"2 10m 7.6044s N, 37 48m 47.2392s E"
2,oak,41.373258,2.159358,274568903,"2 9m 33.6888s N, 41 22m 23.7288s E",POINT (2.15936 41.37326),2,2,zz,37.813122,2.168779,,"2 10m 7.6044s N, 37 48m 47.2392s E"
