In [1]:
import findspark
findspark.init()

import pandas as pd 
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as fn

from pyspark.sql.functions import udf

In [2]:
def equivalent_type(f):
    """Map a pandas/numpy dtype to the matching Spark SQL data type.

    Args:
        f: A pandas/numpy dtype object (or its string name, e.g. ``'int64'``).

    Returns:
        A ``pyspark.sql.types`` data type instance. Any dtype without an
        explicit mapping below falls back to ``StringType()``.
    """
    # datetime64[ns] carries a time-of-day component, so it maps to
    # TimestampType; DateType would silently truncate values to the day.
    if f == 'datetime64[ns]': return TimestampType()
    elif f == 'int64': return LongType()
    elif f == 'int32': return IntegerType()
    # float64 is double precision; Spark's FloatType is only single precision
    # and would lose digits, so use DoubleType here.
    elif f == 'float64': return DoubleType()
    elif f == 'float32': return FloatType()
    else: return StringType()

def define_structure(string, format_type):
    """Build the Spark ``StructField`` describing one pandas column.

    Args:
        string: Column name, used as the field name.
        format_type: The column's pandas dtype, passed to ``equivalent_type``.

    Returns:
        ``StructField(string, <spark type>)``. The type falls back to
        ``StringType()`` if the dtype lookup raises.
    """
    try:
        typo = equivalent_type(format_type)
    except Exception:
        # Best-effort fallback for dtypes whose comparison raises. Catch
        # Exception (not a bare except) so KeyboardInterrupt and SystemExit
        # still propagate.
        typo = StringType()
    return StructField(string, typo)


# Given a pandas DataFrame, return the equivalent Spark DataFrame.
def pandas_to_spark(pandas_df, sparkSession):
    """Convert a pandas DataFrame to a Spark DataFrame with an explicit schema.

    The schema has one field per pandas column, in column order; each field's
    type is derived from the column dtype via ``define_structure``.

    Args:
        pandas_df: The pandas DataFrame to convert.
        sparkSession: The Spark session used to create the Spark DataFrame.

    Returns:
        The DataFrame produced by ``sparkSession.createDataFrame`` using the
        derived schema.
    """
    # Column labels and dtypes are aligned positionally, so zip pairs each
    # column with its own dtype.
    struct_list = [
        define_structure(column, typo)
        for column, typo in zip(pandas_df.columns, pandas_df.dtypes)
    ]
    p_schema = StructType(struct_list)
    return sparkSession.createDataFrame(pandas_df, p_schema)

In [3]:
# Start the Spark engine (local master).
# getOrCreate keeps this cell safe to re-run: constructing a second
# SparkContext in the same kernel raises an error, whereas getOrCreate
# returns the context/session that is already running.
conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [6]:
# Read clients.csv with Spark's built-in CSV reader (the
# "com.databricks.spark.csv" name is only a legacy alias for it).
# The first row is used as the header and column types are inferred.
df_cli = spark.read.csv(
    "../../data/data/clients.csv",
    header=True,
    inferSchema=True,
    sep=",",
)

In [12]:
# Read uplift_train.csv with Spark's built-in CSV reader (the
# "com.databricks.spark.csv" name is only a legacy alias for it).
# The first row is used as the header and column types are inferred.
df_up_train = spark.read.csv(
    "../../data/data/uplift_train.csv",
    header=True,
    inferSchema=True,
    sep=",",
)

In [7]:
# Read products.csv with Spark's built-in CSV reader (the
# "com.databricks.spark.csv" name is only a legacy alias for it).
# The first row is used as the header and column types are inferred.
df_pro = spark.read.csv(
    "../../data/data/products.csv",
    header=True,
    inferSchema=True,
    sep=",",
)

In [8]:
# Read purchases.csv with Spark's built-in CSV reader (the
# "com.databricks.spark.csv" name is only a legacy alias for it).
# The first row is used as the header and column types are inferred.
df_pur = spark.read.csv(
    "../../data/data/purchases.csv",
    header=True,
    inferSchema=True,
    sep=",",
)

In [9]:
# Sum purchase_sum per client_id; the result keeps the name purchase_sum.
df_pur_gb = (
    df_pur
    .groupBy('client_id')
    .agg(fn.sum('purchase_sum').alias('purchase_sum'))
)

In [10]:
# Preview: pull the first three aggregated rows into pandas for display.
df_pur_gb.limit(3).toPandas()

Unnamed: 0,client_id,purchase_sum
0,001ecff0a8,154014.37
1,006391ff01,81622.52
2,0068dd084d,163266.52


In [11]:
# Preview: pull the first three rows of df_cli into pandas for display.
df_cli.limit(3).toPandas()

Unnamed: 0,client_id,first_issue_date,first_redeem_date,age,gender
0,000012768d,2017-08-05 15:40:48,2018-01-04 19:30:07,45,U
1,000036f903,2017-04-10 13:54:23,2017-04-23 12:37:56,72,F
2,000048b7a6,2018-12-15 13:33:11,NaT,68,F


In [19]:
# Preview: pull the first three rows of df_up_train into pandas for display.
df_up_train.limit(3).toPandas()

Unnamed: 0,client_id,treatment_flg,target
0,000012768d,0,1
1,000036f903,1,1
2,00010925a5,1,1


In [20]:
# Count client_id values for each (treatment_flg, target) combination.
df_up_train_summ = (
    df_up_train
    .groupBy('treatment_flg', 'target')
    .agg(fn.count('client_id').alias('n_client'))
)

In [25]:
# Print the per-group client counts as a text table.
df_up_train_summ.show()

+-------------+------+--------+
|treatment_flg|target|n_client|
+-------------+------+--------+
|            1|     0|   36342|
|            1|     1|   63639|
|            0|     0|   39695|
|            0|     1|   60363|
+-------------+------+--------+



In [None]:
# Stop the Spark session once the analysis is finished.
spark.stop()