In [1]:
import pandas as pd

In [40]:
# Sample records in column-oriented form: each key is a column name and each
# list holds that column's values. The last email entry is None (missing).
people = dict(
    first=['Arun', 'Raj', 'Ron', 'Biswa'],
    last=['Ghoshal', 'Aryan', 'Quantum', 'Upadhyay'],
    email=['abc@xyz.com', 'def@ghi.com', 'mno@pqr.com', None],
)

In [41]:
# Plain dict lookup: returns the raw Python list stored under the 'email' key.
people['email']

['abc@xyz.com', 'def@ghi.com', 'mno@pqr.com', None]

In [49]:
# Build a DataFrame from the dict: keys become column labels, lists become the column values.
df = pd.DataFrame(people)
# Preview the top rows of the new frame.
df.head()

Unnamed: 0,first,last,email
0,Arun,Ghoshal,abc@xyz.com
1,Raj,Aryan,def@ghi.com
2,Ron,Quantum,mno@pqr.com
3,Biswa,Upadhyay,


In [43]:
# Summary of the email column; the count in the output is 3 because the None entry is not counted.
df['email'].describe()

count               3
unique              3
top       abc@xyz.com
freq                1
Name: email, dtype: object

```A DataFrame is like a container of multiple Series objects: each column is a Series.```

In [20]:
# A single column selected with [] is a pandas Series.
type(df['email'])

pandas.core.series.Series

In [21]:
# Attribute-style access to the same 'email' column.
df.email

0    abc@xyz.com
1    def@ghi.com
2    mno@pqr.com
3           None
Name: email, dtype: object

In [22]:
# Passing a list of column labels (note the double brackets) selects several columns at once.
df[['last', 'email']]

Unnamed: 0,last,email
0,Ghoshal,abc@xyz.com
1,Aryan,def@ghi.com
2,Quantum,mno@pqr.com
3,Upadhyay,


In [23]:
# List the column labels of the frame.
df.columns

Index(['first', 'last', 'email'], dtype='object')

```
* iloc -> selects rows (and optionally columns) by integer position
* loc -> selects rows (and optionally columns) by label
```


In [27]:
df.iloc[[0,1],2]    # [0,1] picks the rows at positions 0 and 1; the second argument (2) selects the column by integer position

0    abc@xyz.com
1    def@ghi.com
Name: email, dtype: object

In [30]:
# Label-based selection: rows labelled 0 and 2, columns 'first' and 'last'.
df.loc[[0,2],['first', 'last']]

Unnamed: 0,first,last
0,Arun,Ghoshal
2,Ron,Quantum


In [50]:
# Fill the cell at row position 3, column position 2 (the missing email) with the same address row 0 has.
df.iloc[3,2] = 'abc@xyz.com'
# Make the email column the row index; inplace=True modifies df itself instead of returning a new frame.
df.set_index('email',inplace=True)

In [51]:
# Show the frame: email is now the index and 'abc@xyz.com' appears twice as a label.
df

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
abc@xyz.com,Arun,Ghoshal
def@ghi.com,Raj,Aryan
mno@pqr.com,Ron,Quantum
abc@xyz.com,Biswa,Upadhyay


In [53]:
# The index label occurs twice, so .loc returns every matching row's 'last' value.
df.loc['abc@xyz.com','last']

email
abc@xyz.com     Ghoshal
abc@xyz.com    Upadhyay
Name: last, dtype: object

In [54]:
# Turn email back into an ordinary column and restore the default integer index (modifies df in place).
df.reset_index(inplace=True)
df

Unnamed: 0,email,first,last
0,abc@xyz.com,Arun,Ghoshal
1,def@ghi.com,Raj,Aryan
2,mno@pqr.com,Ron,Quantum
3,abc@xyz.com,Biswa,Upadhyay


In [66]:
# Boolean row mask: True where the email matches OR the first name matches.
# Combine the two conditions with & instead of | to require both at once.
filt = df['email'].eq('abc@xyz.com') | df['first'].eq('Biswa')
df[filt]

Unnamed: 0,email,first,last
0,abc@xyz.com,Arun,Ghoshal
3,abc@xyz.com,Biswa,Upadhyay


In [67]:
# .loc accepts the same boolean mask to select rows.
df.loc[filt]

Unnamed: 0,email,first,last
0,abc@xyz.com,Arun,Ghoshal
3,abc@xyz.com,Biswa,Upadhyay


In [68]:
# Mask for the rows, label for the column: the 'last' value of each matching row.
df.loc[filt, 'last']

0     Ghoshal
3    Upadhyay
Name: last, dtype: object

In [71]:
# ~ inverts the mask, so this selects the rows that did NOT match the filter.
df.loc[~filt, 'last']

1      Aryan
2    Quantum
Name: last, dtype: object

In [72]:
# Column labels after the index reset: 'email' now comes first.
df.columns

Index(['email', 'first', 'last'], dtype='object')

In [73]:
# Assigning a list to df.columns replaces every column label positionally.
df.columns = ['email', 'first_name', 'last_name']

In [74]:
# Show the frame with the new column labels.
df

Unnamed: 0,email,first_name,last_name
0,abc@xyz.com,Arun,Ghoshal
1,def@ghi.com,Raj,Aryan
2,mno@pqr.com,Ron,Quantum
3,abc@xyz.com,Biswa,Upadhyay


In [75]:
# Upper-case every column label through the Index's vectorised string accessor.
df.columns = df.columns.str.upper()
df

Unnamed: 0,EMAIL,FIRST_NAME,LAST_NAME
0,abc@xyz.com,Arun,Ghoshal
1,def@ghi.com,Raj,Aryan
2,mno@pqr.com,Ron,Quantum
3,abc@xyz.com,Biswa,Upadhyay


In [76]:
# rename() takes an {old: new} mapping, so only the listed columns change; inplace=True modifies df itself.
df.rename(columns={'FIRST_NAME':'first', 'LAST_NAME':'last'}, inplace=True)
df

Unnamed: 0,EMAIL,first,last
0,abc@xyz.com,Arun,Ghoshal
1,def@ghi.com,Raj,Aryan
2,mno@pqr.com,Ron,Quantum
3,abc@xyz.com,Biswa,Upadhyay


In [77]:
# Rename the one remaining upper-case column back to lower case (in place).
df.rename(columns={'EMAIL':'email'}, inplace=True)
df

Unnamed: 0,email,first,last
0,abc@xyz.com,Arun,Ghoshal
1,def@ghi.com,Raj,Aryan
2,mno@pqr.com,Ron,Quantum
3,abc@xyz.com,Biswa,Upadhyay


In [78]:
# Assigning a list to an existing row label overwrites the whole row, one value per column in column order.
df.loc[2] = ['leo@messi.com', 'Leo', 'Messi']
df

Unnamed: 0,email,first,last
0,abc@xyz.com,Arun,Ghoshal
1,def@ghi.com,Raj,Aryan
2,leo@messi.com,Leo,Messi
3,abc@xyz.com,Biswa,Upadhyay


In [79]:
# Overwrite only the 'first' and 'last' cells of the row labelled 3; its email is left untouched.
df.loc[3,['first', 'last']] = ['MS', 'Dhoni']
df

Unnamed: 0,email,first,last
0,abc@xyz.com,Arun,Ghoshal
1,def@ghi.com,Raj,Aryan
2,leo@messi.com,Leo,Messi
3,abc@xyz.com,MS,Dhoni


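```
Chained indexing such as df[filt]['first'] = value assigns to a temporary copy and triggers a SettingWithCopyWarning.
Try using .loc[row_indexer,col_indexer] = value instead
```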

In [83]:
# Mask of the rows whose email equals 'abc@xyz.com'.
filt = (df['email'] == 'abc@xyz.com')
# Anti-pattern kept on purpose to demonstrate the warning printed below: df[filt] is evaluated
# first, and the assignment is then applied to that intermediate result (a copy, per the warning).
df[filt]['first'] = 'Mahendra Singh'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[filt]['first'] = 'Mahendra Singh'


In [85]:
# Same mask as before, but the assignment goes through a single .loc call (rows by mask, column by label),
# which is the form the warning message recommends.
filt = (df['email'] == 'abc@xyz.com')
df.loc[filt,'first'] = 'Mahendra Singh'

In [88]:
# Read one cell, upper-case the string with the built-in str method, and write it back to the same cell.
df.loc[3,'email'] = df.loc[3,'email'].upper()
df

Unnamed: 0,email,first,last
0,abc@xyz.com,Mahendra Singh,Ghoshal
1,def@ghi.com,Raj,Aryan
2,leo@messi.com,Leo,Messi
3,ABC@XYZ.COM,Mahendra Singh,Dhoni


In [89]:
# The .str accessor applies the string method to every value of the column at once.
df['email'] = df['email'].str.upper()
df

Unnamed: 0,email,first,last
0,ABC@XYZ.COM,Mahendra Singh,Ghoshal
1,DEF@GHI.COM,Raj,Aryan
2,LEO@MESSI.COM,Leo,Messi
3,ABC@XYZ.COM,Mahendra Singh,Dhoni


In [91]:
# apply() on a Series calls the function once per value: here the length of each email string.
df['email'].apply(len)

0    11
1    11
2    13
3    11
Name: email, dtype: int64

In [92]:
def update_email(email):
    """Return ``email`` converted to upper case."""
    uppercased = email.upper()
    return uppercased

# Pass the function itself (no parentheses); apply() calls it once per email value.
df['email'] = df['email'].apply(update_email)
df

Unnamed: 0,email,first,last
0,ABC@XYZ.COM,Mahendra Singh,Ghoshal
1,DEF@GHI.COM,Raj,Aryan
2,LEO@MESSI.COM,Leo,Messi
3,ABC@XYZ.COM,Mahendra Singh,Dhoni


In [93]:
# Same per-value pattern with an anonymous function: lower-case each email.
df['email'] = df['email'].apply(lambda x:x.lower())
df

Unnamed: 0,email,first,last
0,abc@xyz.com,Mahendra Singh,Ghoshal
1,def@ghi.com,Raj,Aryan
2,leo@messi.com,Leo,Messi
3,abc@xyz.com,Mahendra Singh,Dhoni


In [97]:
# With axis='columns' the function receives each row; len(row) is the number of columns (3 here).
df.apply(len, axis='columns')

0    3
1    3
2    3
3    3
dtype: int64

In [98]:
# On a DataFrame, apply() passes each column as a Series; min() gives each column's smallest string.
df.apply(pd.Series.min)

email    abc@xyz.com
first            Leo
last           Aryan
dtype: object

In [99]:
# Equivalent lambda form: x is a whole column (Series), not a single value.
df.apply(lambda x:x.min())

email    abc@xyz.com
first            Leo
last           Aryan
dtype: object

In [100]:
# applymap() calls the function on every individual cell: here the length of each string.
# NOTE(review): pandas 2.1+ deprecates DataFrame.applymap in favour of DataFrame.map —
# confirm against the installed pandas version before switching.
df.applymap(len)

Unnamed: 0,email,first,last
0,11,14,7
1,11,3,5
2,13,3,5
3,11,14,5


In [101]:
# Without an axis argument each column is passed to len(), giving the number of rows (4 here).
df.apply(len)

email    4
first    4
last     4
dtype: int64

In [58]:
import pandas as pd
import re
import numpy as np

In [54]:
# Read the file once. The with-block closes the handle even if read() raises;
# the previous version opened the file twice and never closed the first handle.
with open('raj_0.txt', encoding='utf-8') as f:
    raw_text = f.read()

# Every 'key:value' token in the file, flattened across all lines.
header = re.split(',|\n', raw_text)
# One string per line of the file.
val = re.split('\n', raw_text)
print(header)
print(val)

['a:1', 'b:1', 'c:1', 'd:1', 'e:1', 'h:1', 'a:2', 'e:2', 'd:2', 'h:2', 'f:2', 'c:3', 'e:3', 'd:3', 'h:3', 'f:3', 'a:4', 'b:4', 'c:4', 'e:4', 'h:4', 'f:4', 'i:4', 'j:4']
['a:1,b:1,c:1,d:1,e:1,h:1', 'a:2,e:2,d:2,h:2,f:2', 'c:3,e:3,d:3,h:3,f:3', 'a:4,b:4,c:4,e:4,h:4,f:4,i:4,j:4']


In [55]:
# One empty list per distinct key, in order of first appearance in `header`.
# The key is the text before ':' in each 'key:value' token. The earlier membership
# test split on ',' and therefore never matched an existing key; a dict
# comprehension needs no such guard. Empty tokens (e.g. from a trailing newline
# in the file) are skipped so they do not create a column named ''.
data = {token.split(':')[0]: [] for token in header if token}
data


{'a': [],
 'b': [],
 'c': [],
 'd': [],
 'e': [],
 'h': [],
 'f': [],
 'i': [],
 'j': []}

In [59]:
# Start from empty columns so that re-running this cell does not append a second
# copy of every row to the lists already stored in `data`.
data = {key: [] for key in data}

for row_number, value in enumerate(val, start=1):
    # Keys present on this line: the text before ':' in each 'key:value' pair.
    # Only keys are compared, so a value can never be mistaken for a key.
    keys_in_row = {pair.split(':')[0] for pair in value.split(',')}
    for key, column in data.items():
        # Record the 1-based line number where the key occurs, NaN where it is absent.
        column.append(row_number if key in keys_in_row else np.nan)
data

{'a': [1, 2, nan, 4],
 'b': [1, nan, nan, 4],
 'c': [1, nan, 3, 4],
 'd': [1, 2, 3, nan],
 'e': [1, 2, 3, 4],
 'h': [1, 2, 3, 4],
 'f': [nan, 2, 3, 4],
 'i': [nan, nan, nan, 4],
 'j': [nan, nan, nan, 4]}

In [65]:
# Build the frame from the per-key lists; dtype=object is requested explicitly
# instead of letting pandas infer a dtype for each column.
df = pd.DataFrame(data, dtype=object)
# Show up to the first 10 rows.
df.head(10)

Unnamed: 0,a,b,c,d,e,h,f,i,j
0,1.0,1.0,1.0,1.0,1,1,,,
1,2.0,,,2.0,2,2,2.0,,
2,,,3.0,3.0,3,3,3.0,,
3,4.0,4.0,4.0,,4,4,4.0,4.0,4.0


In [64]:
# Read the whole file once; the with-block guarantees the handle is closed
# even if read() raises, which a manual open()/close() pair does not.
with open('raj_0.txt', encoding='utf-8') as f:
    r_d = f.read()

# Every 'key:value' token in the file, flattened across all lines.
header = re.split(',|\n',r_d)
# One string per line of the file.
val = re.split('\n',r_d)
print(header,val)

['a:1', 'b:1', 'c:1', 'd:1', 'e:1', 'h:1', 'a:2', 'e:2', 'd:2', 'h:2', 'f:2', 'c:3', 'e:3', 'd:3', 'h:3', 'f:3', 'a:4', 'b:4', 'c:4', 'e:4', 'h:4', 'f:4', 'i:4', 'j:4'] ['a:1,b:1,c:1,d:1,e:1,h:1', 'a:2,e:2,d:2,h:2,f:2', 'c:3,e:3,d:3,h:3,f:3', 'a:4,b:4,c:4,e:4,h:4,f:4,i:4,j:4']
