<a href="https://colab.research.google.com/github/bharatkaushik2015/Python-library-practice-notes/blob/main/PandasPractice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Data Structures in Pandas**

Two fundamental data structures used in pandas are:

1. Series: A 1-D array.

2. Data Frame: A 2-D array or two or more Series joined together

In [199]:
import numpy as np
import pandas as pd

In [200]:
# Task1
heights_A=pd.Series([176.2,158.4,167.6,156.2,161.4],index=['s1','s2','s3','s4','s5'])
print(heights_A.shape)

#Task2
weights_A=pd.Series([85.1,90.2,76.8,80.4,78.9],index=['s1','s2','s3','s4','s5'])
print(weights_A.dtypes)

(5,)
float64


In [201]:
heights_A

Unnamed: 0,0
s1,176.2
s2,158.4
s3,167.6
s4,156.2
s5,161.4


In [202]:
weights_A

Unnamed: 0,0
s1,85.1
s2,90.2
s3,76.8
s4,80.4
s5,78.9


In [203]:
df_A=pd.DataFrame({'Student_height':heights_A,'Student_weight':weights_A})
df_A


Unnamed: 0,Student_height,Student_weight
s1,176.2,85.1
s2,158.4,90.2
s3,167.6,76.8
s4,156.2,80.4
s5,161.4,78.9


Accessing Data from Pandas Data structures

In [204]:
# Accessing a single value
heights_A

Unnamed: 0,0
s1,176.2
s2,158.4
s3,167.6
s4,156.2
s5,161.4


In [205]:
heights_A.loc['s1'] #explicit indexing

176.2

In [206]:
heights_A.iloc[2]

167.6

In [207]:
heights_A.get('s1')

176.2

In [208]:
#Accessing a slice
heights_A[0:3] # -> works like how slicing works in list or numpy

Unnamed: 0,0
s1,176.2
s2,158.4
s3,167.6


In [209]:
heights_A['s1':'s4'] # doesn't exclude result for 's4'

Unnamed: 0,0
s1,176.2
s2,158.4
s3,167.6
s4,156.2


In [210]:
heights_A.loc['s6']=170.0 #Overwrites or write a new row

In [211]:
heights_A.loc['s1']=176

In [212]:
heights_A

Unnamed: 0,0
s1,176.0
s2,158.4
s3,167.6
s4,156.2
s5,161.4
s6,170.0


Knowing a Series

In [213]:
weights_A

Unnamed: 0,0
s1,85.1
s2,90.2
s3,76.8
s4,80.4
s5,78.9


In [214]:
# describe() gives statistical summary of data
weights_A.describe()

Unnamed: 0,0
count,5.0
mean,82.28
std,5.377453
min,76.8
25%,78.9
50%,80.4
75%,85.1
max,90.2


Knowing a DataFrame

In [215]:
df_A.info() # gives info about dataframe

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, s1 to s5
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Student_height  5 non-null      float64
 1   Student_weight  5 non-null      float64
dtypes: float64(2)
memory usage: 292.0+ bytes


In [216]:
df_A.describe() # gives statistical summary of numerical fields (default)

Unnamed: 0,Student_height,Student_weight
count,5.0,5.0
mean,163.96,82.28
std,8.073909,5.377453
min,156.2,76.8
25%,158.4,78.9
50%,161.4,80.4
75%,167.6,85.1
max,176.2,90.2


In [217]:
df_A.describe(include='all') # gives statistical summary of all fields

Unnamed: 0,Student_height,Student_weight
count,5.0,5.0
mean,163.96,82.28
std,8.073909,5.377453
min,156.2,76.8
25%,158.4,78.9
50%,161.4,80.4
75%,167.6,85.1
max,176.2,90.2


In [218]:
df_A["label"]=["B","A","C","A","B"]

In [219]:
df_A.describe(include='object') # gives statistical summary of categorical fields

Unnamed: 0,label
count,5
unique,3
top,B
freq,2


**Indexing**

Indexing refers to labeling data elements of a Series, a Data Frame.

These labels can be utilized for selecting portion of data from any of the defined data structures.

In [220]:
df=pd.DataFrame([[1,2],[3,4]], columns=['a','b'])

In [221]:
df.index=['bharat','rahil']

In [222]:
df

Unnamed: 0,a,b
bharat,1,2
rahil,3,4


In [223]:
df.index=pd.date_range(start='1/1/2018', end='1/02/2018') # we can use date_range function to create date index

In [224]:
df

Unnamed: 0,a,b
2018-01-01,1,2
2018-01-02,3,4


Heirarchical Indexing

In [225]:
df_A

Unnamed: 0,Student_height,Student_weight,label
s1,176.2,85.1,B
s2,158.4,90.2,A
s3,167.6,76.8,C
s4,156.2,80.4,A
s5,161.4,78.9,B


In [226]:
df_A.set_index(['Student_height','Student_weight'],inplace=True)

In [227]:
df_A.index

MultiIndex([(176.2, 85.1),
            (158.4, 90.2),
            (167.6, 76.8),
            (156.2, 80.4),
            (161.4, 78.9)],
           names=['Student_height', 'Student_weight'])

In [228]:
pd.bdate_range('11-Sep-2017', '17-Sep-2017', freq='2D') # works like date_range if frequency changed

DatetimeIndex(['2017-09-11', '2017-09-13', '2017-09-15', '2017-09-17'], dtype='datetime64[ns]', freq='2D')

In [229]:
pd.bdate_range('11-Sep-2017', '17-Sep-2017')

DatetimeIndex(['2017-09-11', '2017-09-12', '2017-09-13', '2017-09-14',
               '2017-09-15'],
              dtype='datetime64[ns]', freq='B')

select columns

In [230]:
df_A

Unnamed: 0_level_0,Unnamed: 1_level_0,label
Student_height,Student_weight,Unnamed: 2_level_1
176.2,85.1,B
158.4,90.2,A
167.6,76.8,C
156.2,80.4,A
161.4,78.9,B


In [231]:
df_A.reset_index(inplace=True)

In [232]:
df_A.Student_height

Unnamed: 0,Student_height
0,176.2
1,158.4
2,167.6
3,156.2
4,161.4


In [233]:
df_A[["Student_height","Student_weight"]]  # Multiple columns

Unnamed: 0,Student_height,Student_weight
0,176.2,85.1
1,158.4,90.2
2,167.6,76.8
3,156.2,80.4
4,161.4,78.9


In [234]:
df_A.loc[:,["Student_height"]] # selecting column using explicit index

Unnamed: 0,Student_height
0,176.2
1,158.4
2,167.6
3,156.2
4,161.4


In [235]:
df_A.loc[:,"Student_height":"label"] # selecting all columns using explicit index

Unnamed: 0,Student_height,Student_weight,label
0,176.2,85.1,B
1,158.4,90.2,A
2,167.6,76.8,C
3,156.2,80.4,A
4,161.4,78.9,B


In [236]:
df_A.loc[:,::2]

Unnamed: 0,Student_height,label
0,176.2,B
1,158.4,A
2,167.6,C
3,156.2,A
4,161.4,B


In [237]:
df_A.iloc[:,0] # selecting column using implicit index

Unnamed: 0,Student_height
0,176.2
1,158.4
2,167.6
3,156.2
4,161.4


In [238]:
df_A.iloc[:,[0,2]] # selecting column using implicit index

Unnamed: 0,Student_height,label
0,176.2,B
1,158.4,A
2,167.6,C
3,156.2,A
4,161.4,B


In [239]:
df_A.iloc[:,1::]

Unnamed: 0,Student_weight,label
0,85.1,B
1,90.2,A
2,76.8,C
3,80.4,A
4,78.9,B


Query rows by value

In [240]:

technologies= {
    'Courses':["Spark","PySpark","Hadoop","Python","Pandas"],
    'Fee' :[22000,25000,23000,24000,26000],
    'Duration':['30days','50days','30days', None,np.nan],
    'Discount':[1000,2300,1000,1200,2500]
          }
df = pd.DataFrame(technologies)
print("Create DataFrame:\n", df)

Create DataFrame:
    Courses    Fee Duration  Discount
0    Spark  22000   30days      1000
1  PySpark  25000   50days      2300
2   Hadoop  23000   30days      1000
3   Python  24000     None      1200
4   Pandas  26000      NaN      2500


In [241]:
df.query('Courses=="Spark"')

Unnamed: 0,Courses,Fee,Duration,Discount
0,Spark,22000,30days,1000


In [242]:
df[df["Courses"]=="Spark"]

Unnamed: 0,Courses,Fee,Duration,Discount
0,Spark,22000,30days,1000


In [243]:
df

Unnamed: 0,Courses,Fee,Duration,Discount
0,Spark,22000,30days,1000
1,PySpark,25000,50days,2300
2,Hadoop,23000,30days,1000
3,Python,24000,,1200
4,Pandas,26000,,2500


In [244]:
df.query('Courses in ["Spark","PySpark"]') # Querying using df.query

Unnamed: 0,Courses,Fee,Duration,Discount
0,Spark,22000,30days,1000
1,PySpark,25000,50days,2300


In [245]:
df[df.Courses.isin(["Spark","PySpark"])] # querying using masking

Unnamed: 0,Courses,Fee,Duration,Discount
0,Spark,22000,30days,1000
1,PySpark,25000,50days,2300


In [246]:
df

Unnamed: 0,Courses,Fee,Duration,Discount
0,Spark,22000,30days,1000
1,PySpark,25000,50days,2300
2,Hadoop,23000,30days,1000
3,Python,24000,,1200
4,Pandas,26000,,2500


In [247]:
df.query("`Fee` >= 23000 and `Fee` <= 24000")

Unnamed: 0,Courses,Fee,Duration,Discount
2,Hadoop,23000,30days,1000
3,Python,24000,,1200


In [248]:
# column -> `` use backtick
# column value -> use '' single inverted commas
df.query("`Fee` == 25000 and `Duration` == '50days'")

Unnamed: 0,Courses,Fee,Duration,Discount
1,PySpark,25000,50days,2300


In [249]:
#query row using apply function
df

Unnamed: 0,Courses,Fee,Duration,Discount
0,Spark,22000,30days,1000
1,PySpark,25000,50days,2300
2,Hadoop,23000,30days,1000
3,Python,24000,,1200
4,Pandas,26000,,2500


In [250]:
# apply function: axis-0 --> index , apply function to each column
# apply function: axis-1 --> row , apply function to each row

df.apply(lambda x : x[df["Courses"].isin(["Spark","PySpark"])])

Unnamed: 0,Courses,Fee,Duration,Discount
0,Spark,22000,30days,1000
1,PySpark,25000,50days,2300


In [251]:
df.apply(lambda x : x[0]) # applied to each column

Unnamed: 0,0
Courses,Spark
Fee,22000
Duration,30days
Discount,1000


In [252]:
df.apply(lambda x : x[0], axis=1) # applied to each row

  df.apply(lambda x : x[0], axis=1) # applied to each row


Unnamed: 0,0
0,Spark
1,PySpark
2,Hadoop
3,Python
4,Pandas


Get cell values

In [253]:
df

Unnamed: 0,Courses,Fee,Duration,Discount
0,Spark,22000,30days,1000
1,PySpark,25000,50days,2300
2,Hadoop,23000,30days,1000
3,Python,24000,,1200
4,Pandas,26000,,2500


In [254]:
df.loc[0]["Fee"]

22000

In [255]:
df.loc[0,"Fee"]

22000

In [256]:
df.iloc[0,1]

22000

In [257]:
df.iloc[0][1]

  df.iloc[0][1]


22000

In [258]:
df["Fee"].values[3], df["Fee"].values[0]

(24000, 22000)

Add New column

In [259]:
df

Unnamed: 0,Courses,Fee,Duration,Discount
0,Spark,22000,30days,1000
1,PySpark,25000,50days,2300
2,Hadoop,23000,30days,1000
3,Python,24000,,1200
4,Pandas,26000,,2500


In [260]:
#Method-1
df["Net_Fee"]=df["Fee"]-df["Discount"]

In [261]:
df

Unnamed: 0,Courses,Fee,Duration,Discount,Net_Fee
0,Spark,22000,30days,1000,21000
1,PySpark,25000,50days,2300,22700
2,Hadoop,23000,30days,1000,22000
3,Python,24000,,1200,22800
4,Pandas,26000,,2500,23500


In [262]:
#Method-2
df.assign(Net_Fee=df["Fee"]-df["Discount"])

Unnamed: 0,Courses,Fee,Duration,Discount,Net_Fee
0,Spark,22000,30days,1000,21000
1,PySpark,25000,50days,2300,22700
2,Hadoop,23000,30days,1000,22000
3,Python,24000,,1200,22800
4,Pandas,26000,,2500,23500


In [263]:
# discount percent
df.assign(Discount_per=lambda x:(x.Discount/x.Fee)*100)

Unnamed: 0,Courses,Fee,Duration,Discount,Net_Fee,Discount_per
0,Spark,22000,30days,1000,21000,4.545455
1,PySpark,25000,50days,2300,22700,9.2
2,Hadoop,23000,30days,1000,22000,4.347826
3,Python,24000,,1200,22800,5.0
4,Pandas,26000,,2500,23500,9.615385


In [264]:
#Method-3 Syntax: df.insert(loc position where you want column, "column_name", [values])
df.insert(3, "Constant",[i for i in range(5)])

In [265]:
df

Unnamed: 0,Courses,Fee,Duration,Constant,Discount,Net_Fee
0,Spark,22000,30days,0,1000,21000
1,PySpark,25000,50days,1,2300,22700
2,Hadoop,23000,30days,2,1000,22000
3,Python,24000,,3,1200,22800
4,Pandas,26000,,4,2500,23500


Rename Column

In [266]:
# Method-1
df.columns

Index(['Courses', 'Fee', 'Duration', 'Constant', 'Discount', 'Net_Fee'], dtype='object')

In [267]:
df.columns.values[3]="integer"

In [268]:
df

Unnamed: 0,Courses,Fee,Duration,integer,Discount,Net_Fee
0,Spark,22000,30days,0,1000,21000
1,PySpark,25000,50days,1,2300,22700
2,Hadoop,23000,30days,2,1000,22000
3,Python,24000,,3,1200,22800
4,Pandas,26000,,4,2500,23500


In [269]:
# Method-2 rename method with columns argument & creates a copy, hence provided inplace true
df.rename(columns={"Fee":"fee"}, inplace=True)


Dropping rows

In [270]:
df

Unnamed: 0,Courses,fee,Duration,integer,Discount,Net_Fee
0,Spark,22000,30days,0,1000,21000
1,PySpark,25000,50days,1,2300,22700
2,Hadoop,23000,30days,2,1000,22000
3,Python,24000,,3,1200,22800
4,Pandas,26000,,4,2500,23500


In [271]:
#Method syntax: df.drop(index=index_num)
df.drop(index=2, inplace=True)

In [272]:
df

Unnamed: 0,Courses,fee,Duration,integer,Discount,Net_Fee
0,Spark,22000,30days,0,1000,21000
1,PySpark,25000,50days,1,2300,22700
3,Python,24000,,3,1200,22800
4,Pandas,26000,,4,2500,23500


Dropping column

In [273]:
df

Unnamed: 0,Courses,fee,Duration,integer,Discount,Net_Fee
0,Spark,22000,30days,0,1000,21000
1,PySpark,25000,50days,1,2300,22700
3,Python,24000,,3,1200,22800
4,Pandas,26000,,4,2500,23500


In [274]:
# Method: syntax df.drop(columns="value" or list of multiple values)
df.drop(columns="integer",inplace=True)
df

Unnamed: 0,Courses,fee,Duration,Discount,Net_Fee
0,Spark,22000,30days,1000,21000
1,PySpark,25000,50days,2300,22700
3,Python,24000,,1200,22800
4,Pandas,26000,,2500,23500


Cast Data Type
1. Applying the .astype() method to convert data types directly, specifying the desired dtype.
2. Utilizing the .to_numeric() function to coerce object types into numeric types, with options for handling errors and coercing strings.
3. Using the infer_objects() method to automatically infer and convert data types.
4. Employing the as_type() method to convert data types with specific parameters like nullable integers.
5. Utilizing custom functions or mapping techniques for more complex type conversions.

In [275]:
df.insert(5, "categorical",[i for i in range(4)])  # inplace operation by default

In [276]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 0 to 4
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Courses      4 non-null      object
 1   fee          4 non-null      int64 
 2   Duration     2 non-null      object
 3   Discount     4 non-null      int64 
 4   Net_Fee      4 non-null      int64 
 5   categorical  4 non-null      int64 
dtypes: int64(4), object(2)
memory usage: 224.0+ bytes


In [277]:
# Method: syntax -> provide dictionary to astype method
df=df.astype({"categorical" : object})
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 0 to 4
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Courses      4 non-null      object
 1   fee          4 non-null      int64 
 2   Duration     2 non-null      object
 3   Discount     4 non-null      int64 
 4   Net_Fee      4 non-null      int64 
 5   categorical  4 non-null      object
dtypes: int64(3), object(3)
memory usage: 224.0+ bytes


In [278]:
# Convert catgorical to numeric using pd.to_numeric
df["categorical"]=pd.to_numeric(df["categorical"])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 0 to 4
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Courses      4 non-null      object
 1   fee          4 non-null      int64 
 2   Duration     2 non-null      object
 3   Discount     4 non-null      int64 
 4   Net_Fee      4 non-null      int64 
 5   categorical  4 non-null      int64 
dtypes: int64(4), object(2)
memory usage: 224.0+ bytes


Count number of rows

In [279]:
# Optimized method (because other methods are very slow)
# Method 1
print(df.shape[0])
# Method 2
print(len(df.index))

4
4


In [280]:
# give count of all non-Nan values for each column
df.count()

Unnamed: 0,0
Courses,4
fee,4
Duration,2
Discount,4
Net_Fee,4
categorical,4


Apply Function

In [281]:
# apply function: axis-0 --> index , apply function to each column
# apply function: axis-1 --> row , apply function to each row
df

Unnamed: 0,Courses,fee,Duration,Discount,Net_Fee,categorical
0,Spark,22000,30days,1000,21000,0
1,PySpark,25000,50days,2300,22700,1
3,Python,24000,,1200,22800,2
4,Pandas,26000,,2500,23500,3


In [282]:
def commission(x):
    x["commission"] = x["Net_Fee"]*0.05
    return x["commission"]

df["commission"]=df.apply(commission, axis=1)
df

Unnamed: 0,Courses,fee,Duration,Discount,Net_Fee,categorical,commission
0,Spark,22000,30days,1000,21000,0,1050.0
1,PySpark,25000,50days,2300,22700,1,1135.0
3,Python,24000,,1200,22800,2,1140.0
4,Pandas,26000,,2500,23500,3,1175.0


In [283]:
# In series no need to mention axis
df["categorical"].apply(lambda x:x+1)

Unnamed: 0,categorical
0,1
1,2
3,3
4,4


Column values as list

In [284]:
# wrong method
arr=df.columns
type(arr)

pandas.core.indexes.base.Index

In [285]:
# Method 1
arr1=df.columns.tolist()
arr1

['Courses',
 'fee',
 'Duration',
 'Discount',
 'Net_Fee',
 'categorical',
 'commission']

In [286]:
# method 2
arr2=list(df.columns)
arr2

['Courses',
 'fee',
 'Duration',
 'Discount',
 'Net_Fee',
 'categorical',
 'commission']

In [287]:
# method 3
df.columns.values.tolist()

['Courses',
 'fee',
 'Duration',
 'Discount',
 'Net_Fee',
 'categorical',
 'commission']

Data Aggregation in pandas

In [288]:
# Group by Function
df1=pd.DataFrame({"Name":['A','B','C','A','B','C'],"Marks":[0,1,2,3,4,5]})
df1

Unnamed: 0,Name,Marks
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [289]:
# creates a group by object
df1.groupby('Name')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x791a9258bf40>

In [290]:
df1.groupby('Name').sum()

Unnamed: 0_level_0,Marks
Name,Unnamed: 1_level_1
A,3
B,5
C,7


In [291]:
df2 = df1.groupby('Name').mean() # Notice after group by column 'Name' is set to index

In [292]:
df2.index
df2.reset_index(inplace=True)

In [293]:
df2

Unnamed: 0,Name,Marks
0,A,1.5
1,B,2.5
2,C,3.5


Advance groupby functions
1. aggregate func
2. filter
3. transform

In [294]:
# aggregate
df1.groupby('Name').aggregate(['sum','mean'])

Unnamed: 0_level_0,Marks,Marks
Unnamed: 0_level_1,sum,mean
Name,Unnamed: 1_level_2,Unnamed: 2_level_2
A,3,1.5
B,5,2.5
C,7,3.5


In [295]:
# filter -> inside lambda it's calculating group mean and filtering based on our condition
df1.groupby('Name').filter(lambda x:x['Marks'].mean()>2)

Unnamed: 0,Name,Marks
1,B,1
2,C,2
4,B,4
5,C,5


In [296]:
# Transform -> inside lambda it will calculate group aggregation but result will have same number of rows as main df before group by
# and groupby key will not be present
df1.groupby('Name').transform(lambda x:x.sum())

Unnamed: 0,Marks
0,3
1,5
2,7
3,3
4,5
5,7


Joining Data Frames
1. df.join
2. pd.merge
3. concat

In [297]:
# In pandas join is performed on row indices while in SQL join happens based on column
# syntax: df.join(dataframe or series, on='column_name', how='left,right'(by default left))
technologies = {
    'Courses':["Spark","PySpark","Python","pandas"],
    'Fee' :[20000,25000,22000,30000],
    'Duration':['30days','40days','35days','50days'],
              }
index_labels=['r1','r2','r3','r4']
df1 = pd.DataFrame(technologies,index=index_labels)
print("First DataFrame:\n", df1)
technologies2 = {
    'Courses':["Spark","Java","Python","Go"],
    'Discount':[2000,2300,1200,2000]
              }
index_labels2=['r1','r6','r3','r5']
df2 = pd.DataFrame(technologies2,index=index_labels2)
print("Second DataFRame:\n", df2)

First DataFrame:
     Courses    Fee Duration
r1    Spark  20000   30days
r2  PySpark  25000   40days
r3   Python  22000   35days
r4   pandas  30000   50days
Second DataFRame:
    Courses  Discount
r1   Spark      2000
r6    Java      2300
r3  Python      1200
r5      Go      2000


In [298]:
df1.join(df2,rsuffix='_df1',lsuffix='_df2',how='left')

Unnamed: 0,Courses_df2,Fee,Duration,Courses_df1,Discount
r1,Spark,20000,30days,Spark,2000.0
r2,PySpark,25000,40days,,
r3,Python,22000,35days,Python,1200.0
r4,pandas,30000,50days,,


In [299]:
df1.join(df2,rsuffix='_df1',lsuffix='_df2',how='inner')

Unnamed: 0,Courses_df2,Fee,Duration,Courses_df1,Discount
r1,Spark,20000,30days,Spark,2000
r3,Python,22000,35days,Python,1200


In [300]:
# Merge -> better than join don't even provide extra column how='inner' by default
# syntax-1
pd.merge(df1,df2,on='Courses')

Unnamed: 0,Courses,Fee,Duration,Discount
0,Spark,20000,30days,2000
1,Python,22000,35days,1200


In [301]:
# syntax-2
df1.merge(df2,on='Courses')

Unnamed: 0,Courses,Fee,Duration,Discount
0,Spark,20000,30days,2000
1,Python,22000,35days,1200


In [302]:
# concat -> default axis value 0 , on top of each other
pd.concat([df1,df2], axis=0)

Unnamed: 0,Courses,Fee,Duration,Discount
r1,Spark,20000.0,30days,
r2,PySpark,25000.0,40days,
r3,Python,22000.0,35days,
r4,pandas,30000.0,50days,
r1,Spark,,,2000.0
r6,Java,,,2300.0
r3,Python,,,1200.0
r5,Go,,,2000.0


In [303]:
pd.concat([df1,df2], axis=1) # axis =1 side by side

Unnamed: 0,Courses,Fee,Duration,Courses.1,Discount
r1,Spark,20000.0,30days,Spark,2000.0
r2,PySpark,25000.0,40days,,
r3,Python,22000.0,35days,Python,1200.0
r4,pandas,30000.0,50days,,
r6,,,,Java,2300.0
r5,,,,Go,2000.0
