In [4]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.fpm import FPGrowth # Data mining algorithm
from pyspark.sql import SQLContext # Convert between pandas and Spark DataFrames
# NOTE: importing `round` here shadows Python's built-in round() for every
# cell that runs after this one.
from pyspark.sql.functions import col, round
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer # Specific data preparation
from pyspark.ml.evaluation import MulticlassClassificationEvaluator # Evaluate the model
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [5]:
# Carregando os Dados

In [6]:
# Load the supply-chain CSV into a pandas DataFrame, decoding it as latin1.
# NOTE(review): this is an absolute local path, so the notebook only runs on a
# machine where the file exists at exactly this location.
caminho_csv = "C:/Projetos Pessoais/DataScience/dados/supllyChain/DataCoSupplyChainDataset.csv"
dados = pd.read_csv(caminho_csv, encoding='latin1')

In [7]:
# Dimensions of the loaded DataFrame as (rows, columns).
dados.shape

(180519, 53)

In [10]:
# Sample: preview the first rows of the dataset.
dados.head()

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,Customer City,...,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
0,DEBIT,3,4,91.25,314.640015,Advance shipping,0,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,2/3/2018 22:56,Standard Class
1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/18/2018 12:27,Standard Class
2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,Sporting Goods,San Jose,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/17/2018 12:06,Standard Class
3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Sporting Goods,Los Angeles,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/16/2018 11:45,Standard Class
4,PAYMENT,2,4,134.210007,298.25,Advance shipping,0,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 11:24,Standard Class


In [11]:
# List the column labels of the DataFrame.
dados.columns

Index(['Type', 'Days for shipping (real)', 'Days for shipment (scheduled)',
       'Benefit per order', 'Sales per customer', 'Delivery Status',
       'Late_delivery_risk', 'Category Id', 'Category Name', 'Customer City',
       'Customer Country', 'Customer Email', 'Customer Fname', 'Customer Id',
       'Customer Lname', 'Customer Password', 'Customer Segment',
       'Customer State', 'Customer Street', 'Customer Zipcode',
       'Department Id', 'Department Name', 'Latitude', 'Longitude', 'Market',
       'Order City', 'Order Country', 'Order Customer Id',
       'order date (DateOrders)', 'Order Id', 'Order Item Cardprod Id',
       'Order Item Discount', 'Order Item Discount Rate', 'Order Item Id',
       'Order Item Product Price', 'Order Item Profit Ratio',
       'Order Item Quantity', 'Sales', 'Order Item Total',
       'Order Profit Per Order', 'Order Region', 'Order State', 'Order Status',
       'Order Zipcode', 'Product Card Id', 'Product Category Id',
       'Product De

In [12]:
# Per-column summary: non-null count and dtype.
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 53 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Type                           180519 non-null  object 
 1   Days for shipping (real)       180519 non-null  int64  
 2   Days for shipment (scheduled)  180519 non-null  int64  
 3   Benefit per order              180519 non-null  float64
 4   Sales per customer             180519 non-null  float64
 5   Delivery Status                180519 non-null  object 
 6   Late_delivery_risk             180519 non-null  int64  
 7   Category Id                    180519 non-null  int64  
 8   Category Name                  180519 non-null  object 
 9   Customer City                  180519 non-null  object 
 10  Customer Country               180519 non-null  object 
 11  Customer Email                 180519 non-null  object 
 12  Customer Fname                

In [13]:
# Count the missing values in each column.
# isnull().sum() is vectorised; it produces the same per-column counts as
# applying Python's built-in sum() to every column, without iterating over
# each element in the interpreter.
dados.isnull().sum()

Type                                  0
Days for shipping (real)              0
Days for shipment (scheduled)         0
Benefit per order                     0
Sales per customer                    0
Delivery Status                       0
Late_delivery_risk                    0
Category Id                           0
Category Name                         0
Customer City                         0
Customer Country                      0
Customer Email                        0
Customer Fname                        0
Customer Id                           0
Customer Lname                        8
Customer Password                     0
Customer Segment                      0
Customer State                        0
Customer Street                       0
Customer Zipcode                      3
Department Id                         0
Department Name                       0
Latitude                              0
Longitude                             0
Market                                0


In [14]:
# Build a single 'Customer Name' column from the first- and last-name columns.
# 'Customer Lname' has missing values (8 in the null count above); a bare
# astype(str) would turn those into the literal text "nan" (e.g. "Marynan"),
# so missing parts are replaced with an empty string before concatenating.
# The two parts are still joined without a separator, as before.
# NOTE(review): consider whether a space between first and last name is wanted.
dados['Customer Name'] = (
    dados['Customer Fname'].fillna('').astype(str)
    + dados['Customer Lname'].fillna('').astype(str)
)

In [15]:
# Remove columns that do not appear to be needed for the analysis.
colunas_descartadas = [
    'Product Status',
    'Customer Password',
    'Customer Email',
    'Customer Street',
    'Customer Fname',
    'Customer Lname',
    'Latitude',
    'Longitude',
    'Product Description',
    'Order Zipcode',
    'shipping date (DateOrders)',
]
dados = dados.drop(columns=colunas_descartadas)

In [17]:
# Show 5 randomly chosen rows. The fixed random_state makes the selection
# (and therefore this cell's output) the same on every Restart & Run All.
dados.sample(5, random_state=42)

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,Customer City,...,Order Region,Order State,Order Status,Product Card Id,Product Category Id,Product Image,Product Name,Product Price,Shipping Mode,Customer Name
72051,TRANSFER,4,2,41.049999,113.089996,Late delivery,1,18,Men's Footwear,Washington,...,Central America,Sonora,PROCESSING,403,18,http://images.acmesports.sports/Nike+Men%27s+C...,Nike Men's CJ Elite 2 TD Football Cleat,129.990005,Second Class,MaryBanks
2695,TRANSFER,4,4,1.87,37.400002,Shipping on time,0,7,Hockey,Encinitas,...,US Center,Illinois,PROCESSING,135,7,http://images.acmesports.sports/Nike+Dri-FIT+C...,Nike Dri-FIT Crew Sock 6 Pack,22.0,Standard Class,AnthonySmith
87358,PAYMENT,4,4,133.910004,278.980011,Shipping on time,0,43,Camping & Hiking,Princeton,...,Western Europe,Renania del Norte-Westfalia,PENDING_PAYMENT,957,43,http://images.acmesports.sports/Diamondback+Wo...,Diamondback Women's Serene Classic Comfort Bi,299.980011,Standard Class,JosephStevens
7800,CASH,4,2,-20.879999,104.379997,Late delivery,1,17,Cleats,Caguas,...,South Asia,Jorasán Razaví,CLOSED,365,17,http://images.acmesports.sports/Perfect+Fitnes...,Perfect Fitness Perfect Rip Deck,59.990002,Second Class,EmmaRobinson
45505,PAYMENT,3,4,-215.990005,269.980011,Advance shipping,0,43,Camping & Hiking,Ridgewood,...,Northern Europe,Inglaterra,PENDING_PAYMENT,957,43,http://images.acmesports.sports/Diamondback+Wo...,Diamondback Women's Serene Classic Comfort Bi,299.980011,Standard Class,MarySmith


In [19]:
# Assign zero to the missing values of the customer zip code column.
zipcode = dados['Customer Zipcode']
dados['Customer Zipcode'] = zipcode.mask(zipcode.isna(), 0)

In [23]:
# Data type of each column.
dados.dtypes

Type                              object
Days for shipping (real)           int64
Days for shipment (scheduled)      int64
Benefit per order                float64
Sales per customer               float64
Delivery Status                   object
Late_delivery_risk                 int64
Category Id                        int64
Category Name                     object
Customer City                     object
Customer Country                  object
Customer Id                        int64
Customer Segment                  object
Customer State                    object
Customer Zipcode                 float64
Department Id                      int64
Department Name                   object
Market                            object
Order City                        object
Order Country                     object
Order Customer Id                  int64
order date (DateOrders)           object
Order Id                           int64
Order Item Cardprod Id             int64
Order Item Disco

In [None]:
# Verificando