In [1]:
import pandas as pd
import prose.codeaccelerator as cx

# Demo 1 - Parameters Detection

### Attempting to load a CSV using pandas. Notice that it fails with a `UnicodeDecodeError` because of an encoding mismatch.

In [2]:
file = 'Sentiment/Part1/BenSent.CSV'

# No encoding is passed here, and the read fails inside the 'utf-8' codec on
# this file. That failure is the point of the demo, so catch it and print it:
# the message stays visible while the notebook can still run top to bottom.
try:
    df = pd.read_csv(file)
except UnicodeDecodeError as exc:
    print(f"{type(exc).__name__}: {exc}")

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x97 in position 35: invalid start byte

### Using Prose to scan the CSV and recommend parameter values to load the CSV.

In [3]:
# Let PROSE scan the file and learn the parameters needed to load it.
builder = cx.ReadCsvBuilder(file)
result = builder.learn()

# A bare expression is only rendered when it is the last line of a cell, so
# the preview must be displayed explicitly or it is silently discarded.
# `display` is the built-in provided by the IPython kernel.
display(result.preview_data)

# Last expression of the cell: shows the generated loading code.
result.code()

import pandas as pd


def read_file(file):
    names = [
        "Subject",
        "Body",
        "From_Name",  # "From: (Name)"
        "From_Address",  # "From: (Address)"
        "From_Type",  # "From: (Type)"
        "To_Name",  # "To: (Name)"
        "To_Address",  # "To: (Address)"
        "To_Type",  # "To: (Type)"
        "CC_Name",  # "CC: (Name)"
        "CC_Address",  # "CC: (Address)"
        "CC_Type",  # "CC: (Type)"
        "BCC_Name",  # "BCC: (Name)"
        "BCC_Address",  # "BCC: (Address)"
        "BCC_Type",  # "BCC: (Type)"
        "Billing_Information",  # "Billing Information"
        "Categories",
        "Importance",
        "Mileage",
        "Sensitivity",
    ]

    df = pd.read_csv(
        file,
        encoding="windows-1252",
        skiprows=1,
        header=None,
        names=names,
        quotechar='"',
        delimiter=",",
        index_col=False,
        dtype=str,
        na_values=[],
        keep_default_na=False,
        skipinitialspac

### Loading the CSV using the recommendation from Prose. Notice the inclusion of `encoding='windows-1252'`.

### CSV now loads correctly.

In [4]:
# Re-read the same file, this time passing the encoding PROSE recommended.
df = pd.read_csv(
    file,
    encoding='windows-1252',
)

# Show the first rows to confirm the load succeeded.
df.head()

Unnamed: 0,Subject,Body,From: (Name),From: (Address),From: (Type),To: (Name),To: (Address),To: (Type),CC: (Name),CC: (Address),CC: (Type),BCC: (Name),BCC: (Address),BCC: (Type),Billing Information,Categories,Importance,Mileage,Sensitivity
0,Re: ACTION REQUIRED: Missing Timecard for w/e ...,Since they didn't go over timecard entries dur...,Ben Prescott,/o=ExchangeLabs/ou=Exchange Administrative Gro...,EX,Derek Smith,/o=ExchangeLabs/ou=Exchange Administrative Gro...,EX,,,,,,,,,Normal,,Normal
1,Re: Missing skills on Resouce Request,I went through all the ones I've heard of and ...,Ben Prescott,/O=EXCHANGELABS/OU=EXCHANGE ADMINISTRATIVE GRO...,EX,Ryan Heringhaus,/o=ExchangeLabs/ou=Exchange Administrative Gro...,EX,Dan Wittenberg;Mickey Weibeler;Krista Meschino,/o=ExchangeLabs/ou=Exchange Administrative Gro...,EX;EX;EX,,,,,,Normal,,Normal
2,Re: New Payroll Schedule - Missing Deposit,"Got it, looks like it just came through. Thank...",Ben Prescott,/O=EXCHANGELABS/OU=EXCHANGE ADMINISTRATIVE GRO...,EX,Michael Hoehne,/o=ExchangeLabs/ou=Exchange Administrative Gro...,EX,AHEAD Human Resources,hr@thinkahead.com,SMTP,,,,,,Normal,,Normal
3,Re: New Payroll Schedule - Missing Deposit,"Any update on this? \r\n\r\nRegards,\r\nBen\r\...",Ben Prescott,/O=EXCHANGELABS/OU=EXCHANGE ADMINISTRATIVE GRO...,EX,Michael Hoehne,/o=ExchangeLabs/ou=Exchange Administrative Gro...,EX,AHEAD Human Resources,hr@thinkahead.com,SMTP,,,,,,Normal,,Normal
4,Re: New Payroll Schedule - Missing Deposit,Thanks Michael. I have some others on my team...,Ben Prescott,/O=EXCHANGELABS/OU=EXCHANGE ADMINISTRATIVE GRO...,EX,Michael Hoehne,/o=ExchangeLabs/ou=Exchange Administrative Gro...,EX,AHEAD Human Resources,hr@thinkahead.com,SMTP,,,,,,Normal,,Normal


# Demo 2 - Types Transformation

### Loading the CSV and relabeling its columns for ease of use

In [5]:
# Short, code-friendly labels applied to the six columns after loading.
rescue_cols = [
    'date',
    'app',
    'details',
    'category',
    'apptype',
    'usage',
]

# NOTE(review): read_csv treats the first line of the file as a header row by
# default. Confirm this export really has one; otherwise the first record is
# consumed as the header and lost when the labels are overwritten below.
df2 = pd.read_csv('Sentiment/Part2/rescuetime-history.csv')
df2.columns = rescue_cols

df2.head()

Unnamed: 0,date,app,details,category,apptype,usage
0,2019-08-19 09:00:00 -0700,thinkaheadit.sharepoint.com,No Details,Uncategorized,Uncategorized,66
1,2019-08-19 09:00:00 -0700,msedge,No Details,Utilities,Browsers,56
2,2019-08-19 09:00:00 -0700,rescuetime.com,RescueTime - Privacy settings - Google Chrome,Business,Intelligence,43
3,2019-08-19 09:00:00 -0700,rescuetime.com,RescueTime - Your Daily dashboard - Google Chrome,Business,Intelligence,27
4,2019-08-19 09:00:00 -0700,linkedin.com,Ryan Heringhaus - Director of America's Techni...,Social Networking,Professional Networking,19


### Reviewing initial load dtypes. Notice 'usage' column is of int64 type.

In [6]:
# Print the per-column non-null counts and dtypes of the freshly loaded frame.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52970 entries, 0 to 52969
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   date      52970 non-null  object
 1   app       52952 non-null  object
 2   details   52970 non-null  object
 3   category  52970 non-null  object
 4   apptype   52970 non-null  object
 5   usage     52970 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 2.4+ MB


### Purposefully converting the 'usage' column from integer to string. Usage is a number and should be an integer, but we'll act as if it had been loaded with the wrong type from the start.

In [7]:
# Deliberately turn the numeric 'usage' column into strings so the next step
# has a wrong dtype to detect and repair.
usage_as_text = df2['usage'].astype(str)
df2['usage'] = usage_as_text

# 'usage' should now be reported as object instead of int64.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52970 entries, 0 to 52969
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   date      52970 non-null  object
 1   app       52952 non-null  object
 2   details   52970 non-null  object
 3   category  52970 non-null  object
 4   apptype   52970 non-null  object
 5   usage     52970 non-null  object
dtypes: object(6)
memory usage: 2.4+ MB


### Using Prose to review dtypes and convert them based on its understanding.

### Notice how it converted 'usage' to int64 because it noticed the data should be an integer type.

In [8]:
# Ask PROSE to examine df2 and learn a dtype conversion for it.
builder = cx.DetectTypesBuilder(df2)
result = builder.learn()
# The object returned by result.code() is called like a function on the frame below.
transformation_code = result.code()
# Rebinds df2 to the converted frame, replacing the version whose 'usage'
# column was cast to string.
df2 = transformation_code(df2)
# Print the dtypes again to check the result of the conversion.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52970 entries, 0 to 52969
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   date      52970 non-null  object
 1   app       52952 non-null  object
 2   details   52970 non-null  object
 3   category  52970 non-null  object
 4   apptype   52970 non-null  object
 5   usage     52970 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 2.4+ MB
