 # Data Analysis of DotA Matches using Spark

 ## Loading data

In [None]:
# Mount Google Drive at /content/gdrive so files stored on the Drive can be read
# from this Colab runtime (the dataset path used below lives under that mount point).
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [100]:
# Path of the match CSV on the mounted Drive. Despite the name, this points at a
# single .csv file rather than a directory; it is later handed to SpContext.textFile().
data_directory = '/content/gdrive/MyDrive/Downloads/dataInterpolada80.csv'

## Setting up Spark

In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null 

!wget https://dlcdn.apache.org/spark/spark-3.0.3/spark-3.0.3-bin-hadoop2.7.tgz 

!tar xf /content/spark-3.0.3-bin-hadoop2.7.tgz 

!pip install -q findspark

--2021-11-29 02:40:05--  https://dlcdn.apache.org/spark/spark-3.0.3/spark-3.0.3-bin-hadoop2.7.tgz
Resolving dlcdn.apache.org (dlcdn.apache.org)... 151.101.2.132, 2a04:4e42::644
Connecting to dlcdn.apache.org (dlcdn.apache.org)|151.101.2.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 220400553 (210M) [application/x-gzip]
Saving to: ‘spark-3.0.3-bin-hadoop2.7.tgz’


2021-11-29 02:40:06 (157 MB/s) - ‘spark-3.0.3-bin-hadoop2.7.tgz’ saved [220400553/220400553]



In [3]:
import os

import findspark

# Tell Spark where the JDK and the unpacked Spark distribution live.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.0.3-bin-hadoop2.7",
})

# findspark.init() makes the pyspark shipped inside SPARK_HOME importable, which is
# why the pyspark import has to come after this call rather than at the top.
findspark.init(os.environ["SPARK_HOME"])
from pyspark.sql import SparkSession

# Local-mode session; "local[*]" is the master URL passed to the builder.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Reads a sample CSV and prints the resulting object's type -- this looks like a
# quick check that the session works; it is unrelated to the match data.
file_loc = './sample_data/california_housing_train.csv'
df_spark = spark.read.csv(file_loc, header=True, inferSchema=True)
print(type(df_spark))

<class 'pyspark.sql.dataframe.DataFrame'>


In [4]:
from pyspark.sql import SQLContext

# Keep handles on the session's SparkContext (used for the RDD API below) and on a
# SQLContext constructed from that same context.
SpContext = spark.sparkContext
sqlContext = SQLContext(sparkContext=SpContext)

In [105]:
# Load the CSV as raw text through the SparkContext. Note that `df` is therefore an
# RDD with one string per file line (header included), not a DataFrame.
df = SpContext.textFile(data_directory)
# Peek at the first 10 lines; each line is very wide, so this output is long.
df.take(10)

[',match_id,duration,radiant_win,radiant_gold_adv_at_0,radiant_gold_adv_at_1,radiant_gold_adv_at_2,radiant_gold_adv_at_3,radiant_gold_adv_at_4,radiant_gold_adv_at_5,radiant_gold_adv_at_6,radiant_gold_adv_at_7,radiant_gold_adv_at_8,radiant_gold_adv_at_9,radiant_gold_adv_at_10,radiant_gold_adv_at_11,radiant_gold_adv_at_12,radiant_gold_adv_at_13,radiant_gold_adv_at_14,radiant_gold_adv_at_15,radiant_gold_adv_at_16,radiant_gold_adv_at_17,radiant_gold_adv_at_18,radiant_gold_adv_at_19,radiant_gold_adv_at_20,radiant_gold_adv_at_21,radiant_gold_adv_at_22,radiant_gold_adv_at_23,radiant_gold_adv_at_24,radiant_gold_adv_at_25,radiant_gold_adv_at_26,radiant_gold_adv_at_27,radiant_gold_adv_at_28,radiant_gold_adv_at_29,radiant_gold_adv_at_30,radiant_gold_adv_at_31,radiant_gold_adv_at_32,radiant_gold_adv_at_33,radiant_gold_adv_at_34,radiant_gold_adv_at_35,radiant_gold_adv_at_36,radiant_gold_adv_at_37,radiant_gold_adv_at_38,radiant_gold_adv_at_39,radiant_gold_adv_at_40,radiant_gold_adv_at_41,radiant_gol

In [113]:
def _is_data_line(line):
  """Return True for every line that does not mention the "match_id" column name."""
  return "match_id" not in line


# Drop the header row (the line carrying the column names) and count what is left.
dataLines = df.filter(_is_data_line)
dataLines.count()

2747

In [127]:
from pyspark.sql import Row

def CleanupData(inputStr, gold_adv_len=80):
  """Parse one comma-separated match line into a ``Row``.

  Column positions used (0-based): 2 = duration, 3 = radiant_win, then
  ``gold_adv_len`` gold-advantage samples starting at column 4. Every column
  after the gold block is stored as RADIANT_XP_ADV. Columns 0 and 1 (the
  unnamed index and match_id in the file header) are ignored.

  Args:
    inputStr: One data line of the CSV (the header line must already be removed).
    gold_adv_len: Number of gold-advantage columns that follow ``radiant_win``.
      Defaults to 80, which reproduces the previously hard-coded ``[4:84]`` /
      ``[84:]`` split.

  Returns:
    Row with fields DURATION (int), RADIANT_WIN (1 or 0),
    RADIANT_GOLD_ADV (list of float) and RADIANT_XP_ADV (list of float).

  Raises:
    IndexError: If the line has fewer than four fields.
    ValueError: If the duration or any advantage value is not numeric.
  """
  attList = inputStr.split(",")
  xp_start = 4 + gold_adv_len  # the XP block begins right after the gold block

  return Row(
      DURATION=int(attList[2]),
      # Accept "True" regardless of surrounding whitespace or capitalisation;
      # anything else counts as a loss, as before.
      RADIANT_WIN=1 if attList[3].strip().lower() == "true" else 0,
      RADIANT_GOLD_ADV=[float(value) for value in attList[4:xp_start]],
      RADIANT_XP_ADV=[float(value) for value in attList[xp_start:]],
  )

In [128]:
# Parse every data line into a Row and mark the result for caching. cache() hands
# back the same RDD, so chaining it leaves dataMap bound to the parsed RDD.
dataMap = dataLines.map(CleanupData).cache()
dataMap.take(5)

[Row(DURATION=1202, RADIANT_WIN=1, RADIANT_GOLD_ADV=[0.0, 16.70886075949367, 33.41772151898734, 50.12658227848102, 55.46835443037979, -155.16455696202522, -365.7974683544304, -576.4303797468355, -755.9746835443038, -655.7215189873418, -555.4683544303798, -455.21518987341784, -342.5822784810126, -159.79746835443038, 22.987341772151865, 205.77215189873402, 322.1265822784811, 172.75949367088623, 23.392405063291335, -125.97468354430356, -278.12658227848067, -438.6329113924047, -599.1392405063286, -759.6455696202527, -906.0253164556963, -1019.4430379746836, -1132.860759493671, -1246.2784810126582, -1282.0759493670887, -1173.7215189873418, -1065.367088607595, -957.0126582278483, -863.848101265823, -793.4683544303799, -723.0886075949369, -652.7088607594939, -552.9367088607598, -417.24050632911434, -281.5443037974688, -145.8481012658233, -21.924050632911822, 90.22784810126535, 202.37974683544257, 314.53164556961974, 612.8481012658206, 1063.4810126582256, 1514.1139240506304, 1964.7468354430355,

In [130]:
# Turn the RDD of Rows into a Spark DataFrame; no schema is passed, so column
# names and types come from the Row fields.
dotadf = spark.createDataFrame(dataMap)
# For a Spark DataFrame this shows the column names and types only, not the rows.
display(dotadf)

DataFrame[DURATION: bigint, RADIANT_WIN: bigint, RADIANT_GOLD_ADV: array<double>, RADIANT_XP_ADV: array<double>]

In [129]:
# Summary statistics for the two scalar columns (the array columns are left out).
scalar_cols = dotadf.select("DURATION", "RADIANT_WIN")
scalar_cols.describe().show()

+-------+-----------------+------------------+
|summary|         DURATION|       RADIANT_WIN|
+-------+-----------------+------------------+
|  count|             2747|              2747|
|   mean|2245.389879868948|0.5180196578085183|
| stddev|624.0356193958977|0.4997661605188341|
|    min|            615.0|                 0|
|    max|           4787.0|                 1|
+-------+-----------------+------------------+

