# Understanding Data

In [None]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Optional plot style, currently disabled:
# plt.style.use('seaborn-whitegrid')
# Default figure size (width, height) for every figure created after this line;
# matplotlib interprets figure.figsize in inches.
plt.rcParams['figure.figsize'] = [11, 7]

## cust-wtransactions data

In [None]:
# Load the customer table from the Excel workbook.
data = pd.read_excel('cust-wtransactions.xlsx')
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called on its own; wrapping it in print() would emit a stray "None".
data.info()
print('\n')

# How many stores have more than 300 customers?
# value_counts() gives the number of rows per distinct 'Store Number'.
store_count = data['Store Number'].value_counts().tolist()
more_300_customers = [x for x in store_count if x > 300]
print('How many stores have more than 300 customers?\n', len(more_300_customers), '\n')

# Which city has the most customers?
# value_counts() sorts in descending order by default, so the first row of the
# printed Series is the city with the most rows.
city_count = data['City'].value_counts()
print('Which city has the most customers?\n', city_count)

## transactions data

In [None]:
from bs4 import BeautifulSoup

# Read the whole XML document into memory.
file_name = 'transactions.xml'
with open(file_name, encoding='utf8') as f:
    xml = f.read()

# parsing
soup = BeautifulSoup(xml, 'xml')
transactions = soup.find_all('Transaction')

# One row per <Transaction> element: four attributes plus the element's text.
transaction = [
    [e['Customer_ID'], e['Product_Name'], e['Sales'], e['Order_ID'], e.text]
    for e in transactions
]

# create pandas dataframe
columns_name = ['Customer_ID', 'Product_Name', 'Sales', 'Order_ID', 'Transaction']
# Empty strings become 0 so that the numeric casts below do not fail on them.
df = pd.DataFrame(transaction, columns=columns_name).replace('', 0)
df = df.astype({'Customer_ID': 'int', 'Order_ID': 'int', 'Transaction': 'int', 'Sales': 'float'})
# DataFrame.info() prints its summary itself and returns None, so it is not
# wrapped in print() (that would emit a stray "None").
df.info()
print('\n')

# what are the total sales for Order 10080?
grouped_order_id = df.groupby('Order_ID', as_index=False)['Sales'].sum()
print('Grouped by Order ID and Summ all sales \n')
print(grouped_order_id, '\n')

order_id_10080 = grouped_order_id[grouped_order_id['Order_ID'] == 10080]
print('Filter by order id 10080\n')
print(order_id_10080, '\n')
# Guard against the order being absent: indexing .values[0] on an empty
# selection would raise IndexError.
if order_id_10080.empty:
    print('\nwhat are the total sales for Order 10080? ', 'order not found')
else:
    print('\nwhat are the total sales for Order 10080? ', order_id_10080['Sales'].values[0])


## FuzzyData2

In [None]:
# Load the fuzzy-matching data set.
fuzzy_data = pd.read_csv('fuzzydata2.csv')
# DataFrame.info() prints its summary itself and returns None, so it is not
# wrapped in print() (that would emit a stray "None").
fuzzy_data.info()
print('\n')

# Number of non-null PHONE_NEW values per CONTACT, largest first.
grouped_contact = fuzzy_data.groupby('CONTACT', as_index=False)['PHONE_NEW'].count()
grouped_contact_sorted = grouped_contact.sort_values(by=['PHONE_NEW'], ascending=False)

print('Contact with the most duplicates')
# After the descending sort the first row is the contact with the highest count.
print(grouped_contact_sorted.iloc[0])


## Missing Data

In [None]:
# Load the workbook and show its first rows exactly as read from disk.
missing_data = pd.read_excel('missingdata.xlsx')
raw_preview = missing_data.head()
print(raw_preview, '\n')

# Cells whose value is exactly one space character are turned into NaN so
# that pandas treats them as missing.
missing_data_with_nan = missing_data.replace(to_replace=' ', value=np.nan)
nan_preview = missing_data_with_nan.head()
print(nan_preview, '\n')

# Show only the rows that contain no NaN in any column.
print('Drop rows with NAN')
complete_rows = missing_data_with_nan.dropna()
print(complete_rows)


## Dealing with Missing Data

In [None]:
# Load the data set that contains missing values.
deal_with_missing_data = pd.read_excel('dealing-with-missing-data.xlsx')
# DataFrame.info() prints its summary itself and returns None, so it is not
# wrapped in print() (that would emit a stray "None").
deal_with_missing_data.info()
print('\n')

# drop NAN: keep only rows without any missing value
deal_with_missing_data_dropna = deal_with_missing_data.dropna()

print('After removing the records, how many records are left in the dataset?')
# Print the count explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed.
print(len(deal_with_missing_data_dropna))

# scatter plot: same axes before and after dropping incomplete rows
deal_with_missing_data.plot.scatter(x='Number of Employees', y='Average Number of Tickets', title='With Missing Data')
deal_with_missing_data_dropna.plot.scatter(x='Number of Employees', y='Average Number of Tickets', title='No Missing Data')