In [2]:
import csv
import glob
import sys
import math
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler,StandardScaler,OneHotEncoderEstimator,StringIndexer
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [3]:
# Create the SparkContext (constructed with no arguments) and wrap it in a
# SQLContext; `spark` is the handle the driver cell uses for `spark.read`.
sc=SparkContext()
spark=SQLContext(sc)

# Remove NULL Values

In [23]:
def remove_null(dataset,column_list=None):
    """Drop every row that contains a null in any of the inspected columns.

    Args:
        dataset: DataFrame-like object exposing ``na.drop``.
        column_list: Column names forwarded as ``subset`` to ``na.drop``.
            ``None`` is forwarded unchanged.

    Returns:
        Whatever ``dataset.na.drop(how='any', subset=column_list)`` returns.
    """
    return dataset.na.drop(subset=column_list, how="any")

# Identifying rows with Null values

In [24]:
def identify_null(dataset,column_name=None):
    """Return the rows of ``dataset`` that are null in every requested column.

    One ``filter`` is applied per column, so the conditions are ANDed: a row
    is kept only if all of the listed columns are null.

    Args:
        dataset: DataFrame to inspect; it is not modified.
        column_name: Iterable of column names to test. ``None`` means every
            column in ``dataset.columns``.

    Returns:
        A DataFrame with the same columns as ``dataset``, restricted to the
        matching rows. Null values are left as real nulls.
    """
    if column_name is None:
        column_name = dataset.columns
    with_null = dataset
    for name in column_name:
        # isNull() works for every column type; comparing against a "Null"
        # string sentinel can never match a numeric column.
        with_null = with_null.filter(with_null[name].isNull())
    return with_null

# Remove DUPLICATE Values

In [25]:
def remove_dup(dataset):
    """Return ``dataset`` with duplicate rows removed.

    Args:
        dataset: DataFrame-like object exposing ``dropDuplicates``.

    Returns:
        The result of ``dataset.dropDuplicates()``.
    """
    return dataset.dropDuplicates()

# Trim Space

In [26]:
def trim_space(dataset,column_list=None):
    """Strip leading and trailing whitespace from the given columns.

    Args:
        dataset: DataFrame to transform; it is not modified.
        column_list: Iterable of column names to trim. ``None`` means every
            column whose entry in ``dataset.dtypes`` is ``"string"``.

    Returns:
        A new DataFrame in which each listed column is replaced by
        ``trim(column)``.
    """
    if column_list is None:
        # Restrict the default to string columns: trimming a numeric column
        # would silently turn it into a string column.
        column_list = [name for name, dtype in dataset.dtypes if dtype == "string"]
    df = dataset
    for name in column_list:
        df = df.withColumn(name, trim(df[name]))
    return df


# CASE NORMALIZATION


1-Lower Case Normalization                         
2-Upper Case Normalization                        
Converts the contents of the selected columns to lower or upper case.

In [27]:
def lower_case(dataset,column_list=None):
    """Convert the contents of the given columns to lower case.

    Args:
        dataset: DataFrame to transform; it is not modified.
        column_list: Iterable of column names to convert. ``None`` means every
            column whose entry in ``dataset.dtypes`` is ``"string"``.

    Returns:
        A new DataFrame in which each listed column is replaced by
        ``lower(column)``.
    """
    if column_list is None:
        # Only string columns have a meaningful case to normalize.
        column_list = [name for name, dtype in dataset.dtypes if dtype == "string"]
    df = dataset
    for name in column_list:
        df = df.withColumn(name, lower(df[name]))
    return df

def upper_case(dataset,column_list=None):
    """Convert the contents of the given columns to upper case.

    Args:
        dataset: DataFrame to transform; it is not modified.
        column_list: Iterable of column names to convert. ``None`` means every
            column whose entry in ``dataset.dtypes`` is ``"string"``.

    Returns:
        A new DataFrame in which each listed column is replaced by
        ``upper(column)``.
    """
    if column_list is None:
        # Only string columns have a meaningful case to normalize.
        column_list = [name for name, dtype in dataset.dtypes if dtype == "string"]
    df = dataset
    for name in column_list:
        # Must be upper(), not lower(): this is the upper-case counterpart of lower_case.
        df = df.withColumn(name, upper(df[name]))
    return df

# StringIndexer

In [30]:
def stringIndex(dataset,column_name=None):
    """Add a ``features_<name>`` index column for each given column.

    A separate ``StringIndexer`` is fitted per column, with invalid values
    handled in ``"keep"`` mode, and applied to the running result so the
    returned frame carries all new columns.

    Args:
        dataset: DataFrame to index; it is not modified.
        column_name: Iterable of column names to index. Must be provided;
            iterating ``None`` raises ``TypeError``.

    Returns:
        A tuple ``(indexed, feature_list)``: the DataFrame with the added
        index columns and the list of their names, in input order.
    """
    indexed = dataset
    feature_list = []
    for name in column_name:
        output_name = "features_" + name
        feature_list.append(output_name)
        indexer = StringIndexer(inputCol=name, outputCol=output_name).setHandleInvalid("keep")
        indexed = indexer.fit(indexed).transform(indexed)
    return indexed, feature_list

# OneHotEncoding

In [10]:
def encoder(dataset,column_name=None):
    """One-hot encode the given columns into ``<name>_encoded`` columns.

    Args:
        dataset: DataFrame containing the input columns.
        column_name: List of input column names handed to
            ``OneHotEncoderEstimator`` as ``inputCols``.

    Returns:
        A tuple ``(encoded, encoder_columns)``: the transformed DataFrame and
        the list of output column names, in input order.
    """
    encoder_columns = [source + "_encoded" for source in column_name]
    estimator = OneHotEncoderEstimator(inputCols=column_name,
                                       outputCols=encoder_columns)
    fitted = estimator.fit(dataset)
    return fitted.transform(dataset), encoder_columns

# VECTOR ASSEMBLER

Combines several columns into a single feature vector for clustering.

In [11]:
def vector_assembler(dataset,column_name=None):
    """Combine the given columns into one vector column named ``features``.

    Args:
        dataset: DataFrame containing the input columns.
        column_name: List of column names handed to ``VectorAssembler`` as
            ``inputCols``.

    Returns:
        The DataFrame produced by the assembler's ``transform``.
    """
    return VectorAssembler(inputCols=column_name, outputCol="features").transform(dataset)

# K-MEANS CLUSTERING

In [33]:
def kmeans_cluster(dataset, k=2, seed=1):
    """Fit k-means on ``dataset``, report its quality, and return the centers.

    The estimator is built without naming a feature column, so it relies on
    the library default. The notebook's ``vector_assembler`` writes its output
    to a column called ``features``.

    Args:
        dataset: DataFrame holding the assembled feature vectors.
        k: Number of clusters. Defaults to 2.
        seed: Random seed passed to the estimator so runs are repeatable.
            Defaults to 1.

    Returns:
        The list of cluster centers from ``model.clusterCenters()``.

    Side effects:
        Prints the silhouette score and each cluster center.
    """
    # Train the k-means model.
    model = KMeans().setK(k).setSeed(seed).fit(dataset)

    # Assign every row to a cluster so the clustering can be scored.
    predictions = model.transform(dataset)

    # Evaluate the clustering by computing the silhouette score.
    silhouette = ClusteringEvaluator().evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Show the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
    return centers

# Parsing files one by one

In [31]:
# Interactive driver: for each TSV file, remove duplicate rows, let the user
# apply column transformations, then index, one-hot encode and assemble the
# chosen columns and run k-means on them.
MAX_FILES = 9  # upper bound on the number of files handled in one run

# Maps the code typed by the user to the cleaning function it selects.
TRANSFORMATIONS = {
    "T": trim_space,
    "L": lower_case,
    "U": upper_case,
    "RN": remove_null,
    "IN": identify_null,
}

path = 'uncom/*.tsv'
files = glob.glob(path)
#print(files)
# enumerate over a slice so the file limit is actually enforced.
for count, name in enumerate(files[:MAX_FILES], start=1):
    # rsplit(...)[-1] yields the file name for any directory depth and does
    # not fail when the path contains no "/" at all.
    print("File "+str(count),name.rsplit("/",1)[-1])
    dataset = spark.read.format("csv").options(header="true",inferschema="true",delimiter="\t").load(name)
    m=dataset.count()
    dataset=remove_dup(dataset)
    n=dataset.count()
    print("Before cleaning:",m)
    print("After cleaning:",n)
    column_names=dataset.columns
    for j, column in enumerate(column_names):
        print(str(j)+"-"+column)
    print("Enter the number for the columns you want to transform")
    while(True):
        # Collect the columns for one transformation, one index per prompt.
        column_send=[]
        while(True):
            index=int(input())
            column_send.append(column_names[index])
            print("Continue Y-yes N-no")
            decision=input()
            if(decision.upper()=="N"):
                break

        print("Which tranformation you want to perform")
        print("Trim-T Lowercase-L Uppercase-U RemoveNull-RN IdentifyNull-IN")
        transformation=input()
        # Unknown codes are ignored, so the dataset is left unchanged.
        action=TRANSFORMATIONS.get(transformation.upper())
        if action is not None:
            dataset=action(dataset,column_send)
        print("Continue Y-yes N-no")
        decision=input()
        if(decision.upper()=="N"):
            break
    print("Enter the numbers for columns you want to perform clustering on")
    # Named `selection` rather than `col` so that pyspark.sql.functions.col
    # (star-imported at the top of the notebook) is not shadowed.
    selection=input()
    indexing_list=[column_names[int(i)] for i in selection.split(",")]
    # Cluster the cleaned `dataset` so the transformations above take effect.
    result,feature_list=stringIndex(dataset,indexing_list)
    result,encoded_names=encoder(result,feature_list)
    result=vector_assembler(result,encoded_names)
    kmeans_cluster(result)
    # NOTE(review): this stops after the first file, so MAX_FILES is never
    # reached. Remove it once every file should be processed.
    break
            
        
        
        
        
     
    

File 1 rmmq-46n5.tsv
Before cleaning: 39
After cleaning: 39
0-location_1/latitude
1-location_1/human_address/address
2-location_1/human_address/city
3-location_1/human_address/state
4-location_1/human_address/zip
5-location_1/longitude
6-hours
7-requirements
8-website/url
9-months_of_operation
10-citymap_location/url
11-organizer
12-composted_by
13-materials_accepted
14-days
15-location
16-borough
Enter the number for the columns you want to transform
6
Continue Y-yes N-no
n
Which tranformation you want to perform
Trim-T Lowercase-L Uppercase-U RemoveNull-RN IdentifyNull-IN
t
Continue Y-yes N-no
n
Enter the numbers for columns you want to perform clustering on
6,5,0
+--------------------+
|            features|
+--------------------+
|(90,[13,29,87],[1...|
|(90,[5,28,71],[1....|
|(90,[20,23,58],[1...|
|(90,[0,24,57],[1....|
|(90,[12,50,86],[1...|
|(90,[0,38,73],[1....|
|(90,[11,23,58],[1...|
|(90,[8,45,66],[1....|
|(90,[15,25,56],[1...|
|(90,[1,25,56],[1....|
|(90,[7,47,88],[1....|
|(9