# Notebook 6 - Data Transformation in Pandas

<img src="https://raw.githubusercontent.com/fralfaro/DS-Cheat-Sheets/main/docs/examples/pandas/pandas.png" alt="pandas logo" width = "300">


## Importing pandas

In [None]:
import numpy as np
import pandas as pd

## Handling duplicates

In [None]:
# Toy frame in which some (a, b) pairs repeat, used to demonstrate duplicate handling.
data = pd.DataFrame(
    {
        "a": ["one", "two"] * 3,
        "b": [1, 1, 2, 3, 2, 3],
    }
)
data

In [None]:
# Flag every row that repeats an earlier row (all columns compared);
# the first occurrence of each row stays False.
data.duplicated()

In [None]:
# Boolean-mask selection: keep only the rows flagged as duplicates.
data[data.duplicated()]

In [None]:
# (rows, columns) of the duplicates-only subset; the row count is the number of duplicates.
data[data.duplicated()].shape

In [None]:
# Return a new frame without the repeated rows; `data` itself is not modified.
data.drop_duplicates()

In [None]:
# Add a column of distinct values (0..5), so full rows are no longer identical.
data["c"]=range(6)
data

In [None]:
# Look for duplicates using only columns "a" and "b"; other columns are ignored.
data.duplicated(["a","b"])

In [None]:
# keep="last" flags the earlier occurrences and leaves the last occurrence unflagged.
data.duplicated(["a","b"],keep="last")

In [None]:
# Drop rows duplicated on ("a", "b"), retaining the last occurrence of each pair.
data.drop_duplicates(["a","b"],keep="last")

## Mapping

In [None]:
# Student names (with inconsistent capitalisation) alongside their scores.
df = pd.DataFrame(
    {
        "names": ["Tim", "tom", "Sam", "kate", "Kim"],
        "scores": [60, 50, 70, 80, 40],
    }
)
df

In [None]:
# Upper-case the first letter and lower-case the rest of every name.
n=df["names"].str.capitalize()

In [None]:
# Display the capitalised names.
n

In [None]:
# Lookup table from capitalised student name to class label.
classes = {
    "Tim": "A",
    "Tom": "A",
    "Sam": "B",
    "Kate": "B",
    "Kim": "B",
}

In [None]:
# Look up each capitalised name in `classes`; a name missing from the dict would become NaN.
df["classes"]=n.map(classes)

In [None]:
# Display the frame, now including the "classes" column.
df

## Replace values

In [None]:
# Small integer Series used to demonstrate value replacement.
s = pd.Series(data=[80, 70, 90, 60])
s

In [None]:
# Replace every 70 with NaN; returns a new Series and leaves `s` unchanged.
s.replace(70,np.nan)

In [None]:
# Pairwise list replacement: 70 -> NaN and 60 -> 0.
s.replace([70,60],[np.nan,0])

In [None]:
# Dict form: each key is replaced by its value (90 -> 100, 60 -> 0).
s.replace({90:100,60:0})

## Mapping indexes

In [None]:
# 3x4 frame holding 0..11, with integer row labels and name column labels.
df = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=[0, 1, 2],
    columns=["tim", "tom", "kim", "sam"],
)
df

In [None]:
# `s` gets the default 0..2 index, so mapping the row labels through it
# turns 0/1/2 into "one"/"two"/"three".
s=pd.Series(["one","two","three"])
df.index=df.index.map(s)

## Renaming

In [None]:
# Display the frame with its current row labels.
df

In [None]:
# Apply a function to every label: title-case the index, upper-case the columns.
# No inplace flag is given, so a new frame is returned and `df` is unchanged.
df.rename(index=str.title,columns=str.upper)

In [None]:
# Rename selected labels via dicts; inplace=True mutates `df` instead of returning a copy.
df.rename(index={"one":"ten"},
          columns={"sam":"kate"},
          inplace=True)
df

## Binning

<img src="https://miro.medium.com/max/2000/1*LGTAObYYj2-fdBMFLz30rw.jpeg" width = 500>

In [None]:
# Raw scores to be grouped into bins.
sc = [10, 30, 80, 40, 90, 60, 45, 95, 75, 55, 100, 65, 85]

In [None]:
# Bin edges: five edges define four consecutive bins.
x = [20, 40, 60, 80, 100]

In [None]:
# Assign each score to the interval (left, right] between consecutive edges in `x`.
# Scores outside every bin (e.g. 10, below the first edge) become NaN.
y=pd.cut(sc,x)
y

In [None]:
# Integer bin position for every score; -1 marks scores that fell outside all bins.
y.codes

In [None]:
# The intervals that make up the bins.
y.categories

In [None]:
# Count how many scores landed in each bin.
# The top-level `pd.value_counts` is deprecated (pandas 2.1+); the Series method
# is the supported equivalent.
pd.Series(y).value_counts()

In [None]:
# right=False closes the bins on the left instead: [left, right).
# A score equal to the last edge (100) is therefore no longer inside any bin.
y=pd.cut(sc,x,right=False)
y

In [None]:
# Give the four bins readable names instead of interval notation.
nm=["low", "medium", "high", "very high"]
pd.cut(sc,x,labels=nm)

In [None]:
# An integer instead of edges asks for that many equal-width bins over the data's range.
pd.cut(sc,10)

## Quantile cutting

In [None]:
# Draw 100 standard-normal samples and split them into four quantile-based bins,
# so each bin receives (about) the same number of samples.
data = np.random.randn(100)
c = pd.qcut(data, q=4)
c

In [None]:
# Count the samples per quantile bin.
# The top-level `pd.value_counts` is deprecated (pandas 2.1+); the Series method
# is the supported equivalent.
pd.Series(c).value_counts()

## Detecting outliers and handling signs

In [None]:
# 1000 rows x 4 columns of standard-normal samples; preview the first five rows.
data = pd.DataFrame(np.random.randn(1000, 4))
data.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
data.describe()

In [None]:
# Select the column labelled 1 as a Series.
col=data[1]

In [None]:
# Display the selected column.
col

In [None]:
# Keep only the values whose absolute value exceeds 3.
col[np.abs(col)>3]

In [None]:
# np.sign maps every value to -1, 0 or 1 according to its sign; show the first five rows.
np.sign(data).head()

## Sampling

In [None]:
# 4x3 frame holding 0..11, used to demonstrate row/column sampling.
data = pd.DataFrame(np.arange(12).reshape(4, 3))
data

In [None]:
# Random ordering of every row position in `data`.
# The size comes from the frame rather than a hard-coded 3, so no row is left out
# of the shuffle when the frame has more than three rows.
rw = np.random.permutation(len(data))
rw

In [None]:
# Select rows by integer position, in the order given by `rw`.
data.take(rw)

In [None]:
# Draw one random row (n defaults to 1).
data.sample()

In [None]:
# Draw three distinct random rows (sampling is without replacement by default).
data.sample(n=3)

In [None]:
# axis=1 samples columns instead of rows: pick two random columns.
data.sample(n=2,axis=1)

## Handling categorical variables

## One-hot encoding

In [None]:
# Frame with a categorical "letter" column and a running "number" column.
data = pd.DataFrame(
    {
        "letter": ["c", "b", "a", "b", "b", "a"],
        "number": range(6),
    }
)
data

In [None]:
# One indicator column per distinct letter; each row is marked in the column of its letter.
pd.get_dummies(data["letter"])

In [None]:
# Ten standard-normal samples as a 1-D array.
data = np.random.randn(10)
data

In [None]:
# Bin the values into 4 equal-width intervals, then one-hot encode the bin membership.
pd.get_dummies(pd.cut(data,4))

## Ordinal encoding

In [None]:
# People with an ordered categorical "Score" (Low < Medium < High) and a numeric age.
df = pd.DataFrame(
    {
        "Name": [
            "Ahmed",
            "Mahmoud",
            "Moustafa",
            "Mohamed",
            "Yassin",
            "Abd el Rahman",
            "Emam",
            "Nabil",
            "Saad",
        ],
        "Score": [
            "Low",
            "Low",
            "Medium",
            "Medium",
            "High",
            "Low",
            "Medium",
            "High",
            "Low",
        ],
        "Age": [33, 34, 27, 3, 1, 32, 34, 36, 33],
    }
)

In [None]:
# Display the frame.
df

In [None]:
# Frequency of each distinct value in the "Score" column.
df['Score'].value_counts()

In [None]:
# Encode the ordered categories as integers that preserve their ranking.
ordinal_encoding_mapping = {"Low": 1, "Medium": 2, "High": 3}
# Use `map` rather than `replace`: swapping strings for ints via `replace` relies on
# silent object -> int downcasting, which pandas 2.2 deprecates (the column would
# otherwise stay `object`). `map` yields an integer column directly.
# Any label missing from the mapping would become NaN.
df["Score"] = df["Score"].map(ordinal_encoding_mapping)
df

In [None]:
# Print a summary of the columns: dtypes, non-null counts and memory usage.
df.info()