In [1]:
import pandas as pd
import numpy as np

Along with the data, you can optionally pass **index** (row labels) and **columns** (column labels) arguments

## From dict of Series or dicts


In [2]:
# Each key becomes a column; each Series supplies that column's values and row labels.
dic = dict(
    column1=pd.Series([1, 2, 3], index=['a', 'b', 'c']),  # you can have any datatype here
    column2=pd.Series([3, 4, 5], index=['a', 'b', 'c']),
)

In [3]:
pd.DataFrame(dic)

Unnamed: 0,column1,column2
a,1,3
b,2,4
c,3,5


In this example the **indexes** of the two Series match, but what if they don't match...

In [4]:
# Only labels 'a', 'b', 'c' are shared: 'h' exists just in column1, 'd' and 'e' just in column2.
dic = dict(
    column1=pd.Series([1, 2, 3, 10], index=['a', 'b', 'c', 'h']),
    column2=pd.Series([3, 4, 5, 6, 7], index=['a', 'b', 'c', 'd', 'e']),
)

In [5]:
pd.DataFrame(dic)

Unnamed: 0,column1,column2
a,1.0,3.0
b,2.0,4.0
c,3.0,5.0
d,,6.0
e,,7.0
h,10.0,


Also note that the values are aligned by their **index** label, not by the order in which they appear:

In [6]:
# column1 lists its labels in reverse ('c', 'b', 'a'), so position no longer lines up with label.
dic = dict(
    column1=pd.Series([1, 2, 3], index=['c', 'b', 'a']),
    column2=pd.Series([3, 4, 5], index=['a', 'b', 'c']),
)

In [7]:
pd.DataFrame(dic)

Unnamed: 0,column1,column2
a,3,3
b,2,4
c,1,5


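Each value in column1 is paired with the value in column2 that carries the same label, regardless of the order in which the labels were given:<br>
a --> a<br>
b --> b<br>
c --> c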

___

In [8]:
# Two Series that share the labels 'a', 'b', 'c'.
dic = dict(
    column1=pd.Series([1, 2, 3], index=['a', 'b', 'c']),
    column2=pd.Series([3, 4, 5], index=['a', 'b', 'c']),
)

In [9]:
pd.DataFrame(dic, index=['a', 'b'])  # only the rows labelled 'a' and 'b' are kept; 'c' is left out

Unnamed: 0,column1,column2
a,1,3
b,2,4


In [10]:
pd.DataFrame(dic, index=['a', 'b', 'z'])  

Unnamed: 0,column1,column2
a,1.0,3.0
b,2.0,4.0
z,,


There is no row labelled `z` in the data, so both columns are NaN for that row (which is also why the integers are now displayed as floats).

___

In [11]:
# Two Series labelled 'a', 'b', 'c'; the only keys (columns) are 'column1' and 'column2'.
dic = dict(
    column1=pd.Series([1, 2, 3], index=['a', 'b', 'c']),
    column2=pd.Series([3, 4, 5], index=['a', 'b', 'c']),
)

In [12]:
pd.DataFrame(dic, columns=['column1', 'column3'])

Unnamed: 0,column1,column3
a,1,
b,2,
c,3,


There is no 'column3' in the data, so every row of column3 is NaN; 'column2' is left out because it was not listed in **columns**.

___

You can specify both **index** and **columns**:

In [13]:
# Keep only the rows 'a' and 'b' and only the column 'column1'.
pd.DataFrame(dic, index=['a', 'b'], columns=['column1'])

Unnamed: 0,column1
a,1
b,2


In [14]:
# Labels that are absent from the data ('z', 'xxx') still appear in the result, filled with NaN.
pd.DataFrame(dic, index=['a', 'b', 'z'], columns=['column1', 'xxx'])

Unnamed: 0,column1,xxx
a,1.0,
b,2.0,
z,,


___

In [15]:
# Bind the frame to a name so that its row and column labels can be inspected afterwards.
df = pd.DataFrame(dic, index=['a', 'b'], columns=['column1'])
df

Unnamed: 0,column1
a,1
b,2


In [16]:
df.index

Index(['a', 'b'], dtype='object')

In [17]:
df.columns

Index(['column1'], dtype='object')

<br>

## From dict of ndarrays / lists


In [18]:
# Plain lists work as column values; the string keys '1' and '2' become the column labels.
dic = {'1': [1, 2, 3], '2': [4, 5, 6]}
pd.DataFrame(dic, index=range(1, 4))  # label the rows 1, 2, 3

Unnamed: 0,1,2
1,1,4
2,2,5
3,3,6


## From a list of dicts

In [19]:
# One dict per row: the keys are column labels, the values are that row's cells.
data = [
    {'column1': 1, 'column2': 2},                # row 1
    {'column1': 3, 'column2': 4, 'column3': 5},  # row 2 -- row 1 has no column3, so its value there will be NaN
]

In [20]:
pd.DataFrame(data, index=[1, 2])

Unnamed: 0,column1,column2,column3
1,1,2,
2,3,4,5.0


In [21]:
pd.DataFrame(data, index=(1, 2), columns=['column1', 'column2'])

Unnamed: 0,column1,column2
1,1,2
2,3,4


##### You can have objects of any type in your DataFrame

In [22]:
# Cells are not limited to numbers: a list, a dict, a string and a float all go into one frame.
data1 = [
    {'column1': [1, 2, 3], 'column2': 2},
    {'column1': {'a': 1}, 'column2': 'hello world', 'column3': 5.12},
]

pd.DataFrame(data1)

Unnamed: 0,column1,column2,column3
0,"[1, 2, 3]",2,
1,{'a': 1},hello world,5.12


## From a dict of tuples


You can automatically create a MultiIndexed frame by passing a dictionary whose keys are tuples.



In [23]:
# Outer keys are (level 0, level 1) column labels; the keys of each inner dict are
# (level 0, level 1) row labels. A row/column pair that is never given a value is NaN.
pd.DataFrame(
    {
        ("a", "b"): {("A", "B"): 1, ("A", "C"): 2},
        ("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
        ("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
        ("b", "a"): {("A", "C"): 7, ("A", "B"): 8},
        ("b", "b"): {("A", "D"): 9, ("A", "B"): 10},  # the only column with a value for row ("A", "D")
    }
)


Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


## from_dict

`DataFrame.from_dict` takes a dict of dicts or a dict of array-like sequences and returns a DataFrame.<br>
It operates like the DataFrame constructor except for the **orient** parameter which is **'columns'** by default, but which can be set to **'index'** in order to use the dict keys as row labels.

In [24]:
# Two lists keyed by 'a' and 'b'; the same dict is used with both `orient` settings.
dic = dict(a=[1, 2, 3], b=[4, 5, 6])

In [25]:
pd.DataFrame.from_dict(dic)

Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6


In [26]:
pd.DataFrame.from_dict(dic, orient='index')

Unnamed: 0,0,1,2
a,1,2,3
b,4,5,6


___

FOR MORE INFORMATION ABOUT THESE CHECK THE MAIN FILE ON GITHUB

## Column selection, addition, deletion


In [27]:
# Small people table used by the column selection / addition / deletion examples.
df = pd.DataFrame(
    {
        'name': ['Emma', 'Harry', 'Taylor', 'Allison', 'Leo'],
        'age': [20, 20, 28, 30, 35],
        'income': [300, 300, 150, 200, 1000],
    },
    index=range(1, 6),  # label the rows 1 through 5
)

df

Unnamed: 0,name,age,income
1,Emma,20,300
2,Harry,20,300
3,Taylor,28,150
4,Allison,30,200
5,Leo,35,1000


In [28]:
df['name']

1       Emma
2      Harry
3     Taylor
4    Allison
5        Leo
Name: name, dtype: object

In [29]:
df['rich'] = df['income'] > 200

In [30]:
df

Unnamed: 0,name,age,income,rich
1,Emma,20,300,True
2,Harry,20,300,True
3,Taylor,28,150,False
4,Allison,30,200,False
5,Leo,35,1000,True


In [31]:
df['some_junk_column'] = df['income'] / df['age']
df

Unnamed: 0,name,age,income,rich,some_junk_column
1,Emma,20,300,True,15.0
2,Harry,20,300,True,15.0
3,Taylor,28,150,False,5.357143
4,Allison,30,200,False,6.666667
5,Leo,35,1000,True,28.571429


In [32]:
# Delete a column in place with the `del` statement, then display the frame without it.
del df['some_junk_column']
df

Unnamed: 0,name,age,income,rich
1,Emma,20,300,True
2,Harry,20,300,True
3,Taylor,28,150,False
4,Allison,30,200,False
5,Leo,35,1000,True


___

In [33]:
df['junk'] = df['income'] / df['age']
df

Unnamed: 0,name,age,income,rich,junk
1,Emma,20,300,True,15.0
2,Harry,20,300,True,15.0
3,Taylor,28,150,False,5.357143
4,Allison,30,200,False,6.666667
5,Leo,35,1000,True,28.571429


___

#### Deleting a column with pop()

In [34]:
output = df.pop('junk')  # works like dict.pop: removes the column from df and returns it as a Series

In [35]:
output

1    15.000000
2    15.000000
3     5.357143
4     6.666667
5    28.571429
Name: junk, dtype: float64

In [36]:
df

Unnamed: 0,name,age,income,rich
1,Emma,20,300,True
2,Harry,20,300,True
3,Taylor,28,150,False
4,Allison,30,200,False
5,Leo,35,1000,True


___

In [37]:
df['foo'] = 'bar' 
df

Unnamed: 0,name,age,income,rich,foo
1,Emma,20,300,True,bar
2,Harry,20,300,True,bar
3,Taylor,28,150,False,bar
4,Allison,30,200,False,bar
5,Leo,35,1000,True,bar


___

#### insert

In [38]:
# Insert a new 'id' column at position 1 (right after 'name') holding random 4-digit ids.
# randint's upper bound is exclusive, so 10000 is required for 9999 to be a possible id.
# size follows the number of rows, so the cell keeps working if rows are added or removed.
# The generator is not seeded here, so the ids differ on every run.
df.insert(1, 'id', np.random.randint(1000, 10000, size=len(df)))
df

Unnamed: 0,name,id,age,income,rich,foo
1,Emma,4412,20,300,True,bar
2,Harry,7116,20,300,True,bar
3,Taylor,5800,28,150,False,bar
4,Allison,9695,30,200,False,bar
5,Leo,7423,35,1000,True,bar
