In [3]:
from spamminess import spamminess
from math import exp

# The model: maps each feature to its real-valued weight. It starts out empty.
w = {}

# Step size applied to every weight update (empirically chosen).
delta = 0.002

# Train on one of the training files - in this case, group_x.
# Every line of the file describes one document.
with open('/u/cs451/public_html/spam/spam.train.group_x.txt') as training_file:
    for document in training_file:
        # Field 1 holds the spam/ham tag; every field after it is a feature.
        fields = document.split()
        label, features = fields[1], fields[2:]

        # Score the document with the current model and squash the score
        # into a probability.
        prob = 1.0/(1+exp(-spamminess(features, w)))

        # The adjustment is the same for every feature of this document:
        # push weights up for spam, down for ham, and leave the model
        # untouched for any other tag.
        if label == 'spam':
            step = (1.0-prob)*delta
        elif label == 'ham':
            step = -prob*delta
        else:
            continue

        # Features missing from the model start from a weight of 0.
        for feature in features:
            w[feature] = w.get(feature, 0) + step

# Rank (feature, weight) pairs by weight to report both extremes of the model.
weight_of = lambda entry: entry[1]
result1 = sorted(w.items(), key=weight_of, reverse=True)[:5]
result2 = sorted(w.items(), key=weight_of)[:5]
print("5 features with the highest scores:")
print(result1)
print("5 features with the lowest scores:")
print(result2)

5 features with the highest scores:
[('288281', 0.022996007768337472), ('316070', 0.02178760768974702), ('305568', 0.02166991395934579), ('737304', 0.02155627086676939), ('102264', 0.021381723804112546)]
5 features with the lowest scores:
[('358032', -0.0243996483857188), ('585043', -0.023067846652662177), ('402756', -0.02230073520257498), ('253515', -0.021992394685373944), ('646100', -0.021927190429925553)]


In [4]:
# Display the trained model: the complete feature -> weight dictionary.
w

{'387908': 0.00695474119501872,
 '697162': 0.00695474119501872,
 '426572': 0.00695474119501872,
 '161118': 0.00695474119501872,
 '688171': 0.00695474119501872,
 '43992': 0.00695474119501872,
 '908749': 0.00695474119501872,
 '126841': 0.00695474119501872,
 '116309': 0.00695474119501872,
 '950728': 0.00695474119501872,
 '394920': 0.00695474119501872,
 '358453': 0.00695474119501872,
 '23565': 0.00695474119501872,
 '210162': 0.01394765848836145,
 '861926': 0.013942427517810815,
 '177667': 0.00695474119501872,
 '284634': 0.00695474119501872,
 '358345': 0.00695474119501872,
 '971891': 0.00695474119501872,
 '646357': 0.00695474119501872,
 '599737': 0.00695474119501872,
 '957529': 0.00695474119501872,
 '970014': 0.00695474119501872,
 '449273': 0.00695474119501872,
 '129997': 0.00695474119501872,
 '244086': 0.00695474119501872,
 '529524': 0.00695474119501872,
 '170675': 0.00695474119501872,
 '126503': 0.00695474119501872,
 '455001': 0.00695474119501872,
 '648917': 0.00695474119501872,
 '255018'

In [10]:
# Number of distinct features that received a weight in the model.
len(w)

296775

In [2]:
import findspark, random
findspark.init("/u/cs451/packages/spark")

from pyspark import SparkContext, SparkConf
# Serve the Spark UI on a random port in [4000, 5000) -- presumably so that
# several sessions on the same host do not fight over one port (TODO confirm).
spark_conf = SparkConf().set('spark.ui.port', random.randrange(4000,5000))
sc = SparkContext(appName="YourTest", master="local[2]", conf=spark_conf)

In [4]:
delta=0.002
# Kept so this cell still (re)initialises the driver-side name `w`.
# Training itself no longer reads or mutates it (see sgd below).
w={}

def sgd(all_list):
    """Train a spam model with one sequential SGD pass over a partition.

    Parameters:
        all_list: iterable of documents, each a list whose first item is the
            'spam'/'ham' tag and whose remaining items are the features.

    Returns:
        dict mapping each feature seen in a 'spam' or 'ham' document to its
        learned weight.

    The model lives in a local dict rather than in a module-level one.
    Spark ships closure variables to the workers as copies, so mutating a
    global inside a task never updates the driver, and state left behind in
    a worker could leak from one training run into the next.
    """
    weights = {}
    for sub_list in all_list:
        t = sub_list[0]
        F = sub_list[1:]
        # Probability that the document is spam under the current model.
        prob = 1.0/(1+exp(-spamminess(F, weights)))
        # Every feature of the document gets the same adjustment.
        if t == 'spam':
            step = (1.0-prob)*delta
        elif t == 'ham':
            step = -prob*delta
        else:
            # Unknown tag: leave the model unchanged.
            continue
        for feature in F:
            weights[feature] = weights.get(feature, 0) + step
    return weights

def train_and_save(training_rdd, model_path):
    """Train on every line of training_rdd and save the model as text.

    Parameters:
        training_rdd: RDD of raw training lines; the first whitespace-separated
            field of each line is dropped, the rest is passed to sgd.
        model_path: output directory for saveAsTextFile. Spark refuses to
            overwrite an existing directory, so remove it before re-running.

    Returns:
        RDD of (feature, weight) pairs that was written to model_path.
    """
    weights_rdd = (training_rdd
                   .map(lambda line: line.split()[1:])
                   # SGD is order-dependent and sequential, so gather all
                   # documents into a single partition and train on it as one list.
                   .coalesce(1)
                   .glom()
                   .map(sgd)
                   .flatMap(lambda model: model.items()))
    weights_rdd.saveAsTextFile(model_path)
    return weights_rdd

group_x = sc.textFile("/u/cs451/public_html/spam/spam.train.group_x.txt")
weight_x = train_and_save(group_x, "models/group_x_model")

group_y = sc.textFile("/u/cs451/public_html/spam/spam.train.group_y.txt")
weight_y = train_and_save(group_y, "models/group_y_model")

group_b = sc.textFile("/u/cs451/public_html/spam/spam.train.britney.txt")
weight_b = train_and_save(group_b, "models/britney_model")

In [None]:
import random
group_xx= sc.textFile("/u/cs451/public_html/spam/spam.train.group_x.txt")
n=group_xx.count()
# l is a random permutation of range(n): l[i] is the sort key given to line i,
# so sorting by that key shuffles the training documents.
l=random.sample(range(n),n)
# Kept so this cell still (re)initialises the driver-side name `w`.
# Training itself no longer reads or mutates it (see sgd below).
w={}
delta=0.002
def random_key(x):
    """Return the shuffled sort key assigned to the line at index x."""
    return l[x]

def sgd(all_list):
    """Train a spam model with one sequential SGD pass over a partition.

    Parameters:
        all_list: iterable of documents, each a list whose first item is the
            'spam'/'ham' tag and whose remaining items are the features.

    Returns:
        dict mapping each feature seen in a 'spam' or 'ham' document to its
        learned weight.

    The model lives in a local dict rather than in a module-level one.
    Spark ships closure variables to the workers as copies, so mutating a
    global inside a task never updates the driver and is not a reliable
    place to keep the model.
    """
    weights = {}
    for sub_list in all_list:
        t = sub_list[0]
        F = sub_list[1:]
        # Probability that the document is spam under the current model.
        prob = 1.0/(1+exp(-spamminess(F, weights)))
        # Every feature of the document gets the same adjustment.
        if t == 'spam':
            step = (1.0-prob)*delta
        elif t == 'ham':
            step = -prob*delta
        else:
            # Unknown tag: leave the model unchanged.
            continue
        for feature in F:
            weights[feature] = weights.get(feature, 0) + step
    return weights

# Parentheses replace the backslash continuations; the original chain ended in
# a dangling backslash that only parsed because a whitespace-only line followed.
weight_xx_s = (group_xx.map(lambda line: line.split()[1:]).zipWithIndex()
               # Key every document by its shuffled position, then sort to shuffle.
               .map(lambda x: (random_key(x[1]), x[0])).sortByKey().map(lambda x: x[1])
               # SGD is order-dependent and sequential: train on one partition.
               .coalesce(1)
               .glom().map(sgd).flatMap(lambda model: model.items()))

# top(5) orders the (weight, feature) pairs itself, so no sortByKey is needed first.
weight_xx_s.map(lambda x: (x[1],x[0])).top(5)

In [3]:
# Put your model based classifier implementation into this cell
import ast

def load_model(model_path):
    """Read a saved model directory back as a list of (feature, weight) tuples.

    Each saved line is the text form of a (feature, weight) tuple. It is parsed
    with ast.literal_eval, which only accepts Python literals, instead of
    eval, which would execute whatever code happens to be in the file.
    """
    return sc.textFile(model_path).map(lambda x: ast.literal_eval(x)).collect()

# Weight models, each a dict like {feature: weight, ...}.
model_x = load_model("models/group_x_model")
w_x = dict(model_x)
model_y = load_model("models/group_y_model")
w_y = dict(model_y)
model_b = load_model("models/britney_model")
w_b = dict(model_b)

def _score(weights, features):
    """Sum the weights of the given features; unknown features count as 0."""
    # Explicit left-to-right accumulation keeps the floating-point result
    # independent of how the built-in sum() treats floats.
    score = 0
    for feature in features:
        score += weights.get(feature, 0)
    return score

# calculate score for each document
def score_x(x):
    """Score a document's feature list x with the group_x model."""
    return _score(w_x, x)

def score_y(x):
    """Score a document's feature list x with the group_y model."""
    return _score(w_y, x)

def score_b(x):
    """Score a document's feature list x with the britney model."""
    return _score(w_b, x)

# classification
def prediction(s):
    """Return 'spam' for a strictly positive score s, otherwise 'ham'."""
    return 'spam' if s > 0 else 'ham'

test_t=sc.textFile("/u/cs451/public_html/spam/spam.test.qrels.txt")

def classify(score_fn):
    """Score and label every test document with score_fn.

    Returns an RDD of (field 0, field 1, score, predicted label) tuples, where
    fields 0 and 1 are copied from the test line and the remaining fields are
    the features handed to score_fn.
    """
    return (test_t.map(lambda line: line.split())
                  .map(lambda line: (line[0], line[1], score_fn(line[2:])))
                  .map(lambda line: (line[0], line[1], line[2], prediction(line[2]))))

# use group_x weight model
test1 = classify(score_x)
test1.saveAsTextFile("output1")
# use group_y weight model
test2 = classify(score_y)
test2.saveAsTextFile("output2")
# use britney weight model
test3 = classify(score_b)
test3.saveAsTextFile("output3")

In [5]:
# Run the course-provided evaluation script on each classifier's output
# directory; each run prints one "1-ROCA%" figure.
!/u/cs451/bin/spam_eval.sh output1
!/u/cs451/bin/spam_eval.sh output2
!/u/cs451/bin/spam_eval.sh output3

1-ROCA%: 17.26
1-ROCA%: 12.82
1-ROCA%: 15.95
