# Introduction to ptype-cat

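This notebook walks through a simple scenario that motivates ptype-cat: working out which columns of a dataset are categorical. We first mark the categorical columns by hand with pandas, then run ptype, and finally run ptype-cat on the same data.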

In [1]:
# Plotting setup: render figures inline and reset matplotlib's rcParams to
# its built-in defaults.
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcdefaults()

import numpy as np
import pandas as pd
# Cap the text shown per table cell at 20 characters; this is why long values
# in the tables below are cut off with "...".
pd.options.display.max_colwidth = 20

# Ptype is used in section 2 and PtypeCat in section 3 to fit a schema on the
# same data.
from ptype.Ptype import Ptype
from ptype.PtypeCat import PtypeCat

## 1. Using Standard Python Libraries

In [2]:
# Columns to keep, listed in the order they should appear in the frame.
columns = ["Class Name", "Review Text", "Age", "Rating"]
csv_path = '../data/womens-clothing-reviews.csv'

# usecols only selects which columns are parsed (its element order is
# ignored), so the result is indexed with the same list to fix the order.
df = pd.read_csv(csv_path, usecols=columns)[columns]
df.head()

Unnamed: 0,Class Name,Review Text,Age,Rating
0,Intimates,Absolutely wonde...,33,4
1,Dresses,Love this dress!...,34,5
2,Dresses,I had such high ...,60,3
3,Pants,"I love, love, lo...",50,5
4,Blouses,This shirt is ve...,47,5


In [3]:
# Show the dtype pandas assigned to each column when the CSV was read with
# default settings (no dtype argument was passed to read_csv).
df.dtypes

Class Name     object
Review Text    object
Age             int64
Rating          int64
dtype: object

### Converting to Pandas Categorical Type

In [4]:
# The columns to treat as categorical are chosen by hand here; pandas parsed
# them as object and int64 respectively.
categorical_columns = ("Class Name", "Rating")
for name in categorical_columns:
    as_category = df[name].astype('category')
    df[name] = as_category
df.head()

Unnamed: 0,Class Name,Review Text,Age,Rating
0,Intimates,Absolutely wonde...,33,4
1,Dresses,Love this dress!...,34,5
2,Dresses,I had such high ...,60,3
3,Pants,"I love, love, lo...",50,5
4,Blouses,This shirt is ve...,47,5


In [5]:
# Check the dtypes again after the manual astype('category') conversion of
# "Class Name" and "Rating".
df.dtypes

Class Name     category
Review Text      object
Age               int64
Rating         category
dtype: object

## 2. Using ptype

In [6]:
# Columns to keep, listed in the order they should appear in the frame.
columns = ["Class Name", "Review Text", "Age", "Rating"]
csv_path = '../data/womens-clothing-reviews.csv'

# Reload the data without letting pandas interpret it: dtype='str' keeps every
# value as text, and keep_default_na=False stops empty or NA-like fields from
# being turned into NaN. The trailing [columns] fixes the column order.
df = pd.read_csv(
    csv_path,
    usecols=columns,
    dtype='str',
    keep_default_na=False,
)[columns]
df.head()

Unnamed: 0,Class Name,Review Text,Age,Rating
0,Intimates,Absolutely wonde...,33,4
1,Dresses,Love this dress!...,34,5
2,Dresses,I had such high ...,60,3
3,Pants,"I love, love, lo...",50,5
4,Blouses,This shirt is ve...,47,5


In [7]:
# Fit a ptype schema on the all-string frame loaded in the previous cell.
ptype = Ptype()

schema = ptype.schema_fit(df)
# Display the fitted schema: per column, the inferred type and the values
# treated as normal, missing and anomalous.
schema.show()

Unnamed: 0,Class Name,Review Text,Age,Rating
type,string,string,integer,integer
normal values,"[Blouses, Casual...",[7 months pregna...,"[18, 19, 20, 21,...","[1, 2, 3, 4, 5]"
missing values,[ε],[ε],[],[]
anomalous values,[],"[""does this swea...",[],[]
(empty string marker),ε,ε,,


## 3. Using ptype-cat

In [8]:
# Same selection and column order as in the earlier sections.
columns = ["Class Name", "Review Text", "Age", "Rating"]
csv_path = '../data/womens-clothing-reviews.csv'

# Start again from an untouched all-string frame: every value is read as text
# (dtype='str') and empty or NA-like fields are not converted to NaN
# (keep_default_na=False). Indexing with [columns] fixes the column order.
df = pd.read_csv(
    csv_path,
    usecols=columns,
    dtype='str',
    keep_default_na=False,
)[columns]
df.head()

Unnamed: 0,Class Name,Review Text,Age,Rating
0,Intimates,Absolutely wonde...,33,4
1,Dresses,Love this dress!...,34,5
2,Dresses,I had such high ...,60,3
3,Pants,"I love, love, lo...",50,5
4,Blouses,This shirt is ve...,47,5


In [9]:
# Fit a ptype-cat schema on the all-string frame loaded in the previous cell,
# using the same schema_fit / show calls as the ptype section.
# NOTE(review): the recorded output of this cell starts with two links to
# scikit-learn's model-persistence notes; presumably a warning raised while a
# persisted model is loaded -- confirm, and consider pinning scikit-learn.
ptype_cat = PtypeCat()

schema = ptype_cat.schema_fit(df)
# Display the fitted schema so it can be compared with the ptype result.
schema.show()

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


Unnamed: 0,Class Name,Review Text,Age,Rating
type,categorical,string,integer,categorical
normal values,"[Blouses, Casual...",[7 months pregna...,"[18, 19, 20, 21,...","[1, 2, 3, 4, 5]"
missing values,[ε],[ε],[],[]
anomalous values,[],"[""does this swea...",[],[]
(empty string marker),ε,ε,,
