In [2]:
import itertools
import dateutil
import pickle
import datetime
import locale
import hashlib
import pycountry
import pandas as pd
import numpy as np
import scipy
from scipy.spatial import distance as ssd
from sklearn import model_selection, preprocessing
from collections import defaultdict


In [3]:
df_train = pd.read_csv('/Users/JQC/Desktop/data/train.csv')
df_test = pd.read_csv('/Users/JQC/Desktop/data/test.csv')
# df_store = pd.read_csv('/Users/JQC/Desktop/data/store.csv')

In [4]:
df_train.head()

Unnamed: 0,user,event,invited,timestamp,interested,not_interested
0,3044012,1918771225,0,2012-10-02 15:53:05.754000+00:00,0,0
1,3044012,1502284248,0,2012-10-02 15:53:05.754000+00:00,0,0
2,3044012,2529072432,0,2012-10-02 15:53:05.754000+00:00,1,0
3,3044012,3072478280,0,2012-10-02 15:53:05.754000+00:00,0,0
4,3044012,1390707377,0,2012-10-02 15:53:05.754000+00:00,0,0


In [5]:
df_test.head()

Unnamed: 0,user,event,invited,timestamp
0,1776192,2877501688,0,2012-11-30 11:39:01.230000+00:00
1,1776192,3025444328,0,2012-11-30 11:39:01.230000+00:00
2,1776192,4078218285,0,2012-11-30 11:39:01.230000+00:00
3,1776192,1024025121,0,2012-11-30 11:39:01.230000+00:00
4,1776192,2972428928,0,2012-11-30 11:39:21.985000+00:00


In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15398 entries, 0 to 15397
Data columns (total 6 columns):
user              15398 non-null int64
event             15398 non-null int64
invited           15398 non-null int64
timestamp         15398 non-null object
interested        15398 non-null int64
not_interested    15398 non-null int64
dtypes: int64(5), object(1)
memory usage: 721.9+ KB


In [7]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10237 entries, 0 to 10236
Data columns (total 4 columns):
user         10237 non-null int64
event        10237 non-null int64
invited      10237 non-null int64
timestamp    10237 non-null object
dtypes: int64(3), object(1)
memory usage: 320.0+ KB


In [8]:
class DataCleaner:
  def __init__(self):
    self.localeIdMap = defaultdict(int)
    for index, l in enumerate(locale.locale_alias.keys()):
        self.localeIdMap[l] = index + 1
    self.countryIdMap = defaultdict(int)
    ctryIdx = defaultdict(int)
    for index, country in enumerate(pycountry.countries):
        self.countryIdMap[country.name.lower()] = index + 1
        if country.name.lower() == "usa":
            ctryIdx["US"] = index
        if country.name.lower() == "canada":
            ctryIdx["CA"] = index
    print(ctryIdx['CA'], ctryIdx['US'], len(self.countryIdMap))
    for cc in ctryIdx.keys():
        # 获取美国 加拿大的省份
      for s in pycountry.subdivisions.get(country_code=cc):
        self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
    # 载入 gender id 字典
    self.genderIdMap = defaultdict(int, {"male":1, "female":2})
    
  def getLocaleId(self, locstr):
    return self.localeIdMap[locstr.lower()]

  def getGenderId(self, genderStr):
    return self.genderIdMap[genderStr]

  def getJoinedYearMonth(self, dateString):
    dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
    return "".join([str(dttm.year), str(dttm.month)])

  def getCountryId(self, location):
    if (isinstance(location, str)
        and len(location.strip()) > 0
        and location.rfind("  ") > -1):
      return self.countryIdMap[location[location.rindex("  ") + 2:].lower()]
    else:
      return 0

  def getBirthYearInt(self, birthYear):
    try:
      return 0 if birthYear == "None" else int(birthYear)
    except:
      return 0

  def getTimezoneInt(self, timezone):
    try:
      return int(timezone)
    except:
      return 0
  def getFeatureHash(self, value):
    if len(value.strip()) == 0:
      return -1
    else:
      return int(hashlib.sha224(value).hexdigest()[0:4], 16)

  def getFloatValue(self, value):
    if len(value.strip()) == 0:
      return 0.0
    else:
      return float(value)

In [9]:
dataCleaner = DataCleaner()
len(dataCleaner.countryIdMap)

39 0 249


313

In [10]:
len(pycountry.subdivisions.get(country_code='US')),len(pycountry.subdivisions.get(country_code='CA'))



(57, 13)

In [11]:
pycountry.subdivisions.get(country_code='CN')

{Subdivision(code='CN-11', country_code='CN', name='Beijing', parent_code=None, type='Municipality'),
 Subdivision(code='CN-12', country_code='CN', name='Tianjin', parent_code=None, type='Municipality'),
 Subdivision(code='CN-13', country_code='CN', name='Hebei', parent_code=None, type='Province'),
 Subdivision(code='CN-14', country_code='CN', name='Shanxi', parent_code=None, type='Province'),
 Subdivision(code='CN-15', country_code='CN', name='Nei Mongol', parent_code=None, type='Autonomous region'),
 Subdivision(code='CN-21', country_code='CN', name='Liaoning', parent_code=None, type='Province'),
 Subdivision(code='CN-22', country_code='CN', name='Jilin', parent_code=None, type='Province'),
 Subdivision(code='CN-23', country_code='CN', name='Heilongjiang', parent_code=None, type='Province'),
 Subdivision(code='CN-31', country_code='CN', name='Shanghai', parent_code=None, type='Municipality'),
 Subdivision(code='CN-32', country_code='CN', name='Jiangsu', parent_code=None, type='Provin

In [12]:
[*pycountry.countries]

[Country(alpha_2='AW', alpha_3='ABW', name='Aruba', numeric='533'),
 Country(alpha_2='AF', alpha_3='AFG', name='Afghanistan', numeric='004', official_name='Islamic Republic of Afghanistan'),
 Country(alpha_2='AO', alpha_3='AGO', name='Angola', numeric='024', official_name='Republic of Angola'),
 Country(alpha_2='AI', alpha_3='AIA', name='Anguilla', numeric='660'),
 Country(alpha_2='AX', alpha_3='ALA', name='Åland Islands', numeric='248'),
 Country(alpha_2='AL', alpha_3='ALB', name='Albania', numeric='008', official_name='Republic of Albania'),
 Country(alpha_2='AD', alpha_3='AND', name='Andorra', numeric='020', official_name='Principality of Andorra'),
 Country(alpha_2='AE', alpha_3='ARE', name='United Arab Emirates', numeric='784'),
 Country(alpha_2='AR', alpha_3='ARG', name='Argentina', numeric='032', official_name='Argentine Republic'),
 Country(alpha_2='AM', alpha_3='ARM', name='Armenia', numeric='051', official_name='Republic of Armenia'),
 Country(alpha_2='AS', alpha_3='ASM', nam

## 2.处理user和event关联数据

In [13]:
class ProgramEntities:
    # 因为是事件推荐 我们只关心user和event,重点处理这部分的关联数据
  def __init__(self):
    # 统计训练集中有多少独立的用户和events
    uniqueUsers = set()
    uniqueEvents = set()
    eventsForUser = defaultdict(set)
    usersForEvent = defaultdict(set)
    for filename in ["/Users/JQC/Desktop/data/train.csv", "/Users/JQC/Desktop/data/test.csv"]:
      with open(filename, 'r') as f:
        f.readline().strip().split(",")
        for line in f:
            cols = line.strip().split(",")
            uniqueUsers.add(cols[0])
            uniqueEvents.add(cols[1])
            eventsForUser[cols[0]].add(cols[1])
            usersForEvent[cols[1]].add(cols[0])
#     print(eventsForUser)
#     print(usersForEvent)
    # 将用户和事件转化为稀疏矩阵 行为用户 列为事件
    self.userEventScores = scipy.sparse.dok_matrix((len(uniqueUsers), len(uniqueEvents)))
    self.userIndex = dict()
    self.eventIndex = dict()
    for i, u in enumerate(uniqueUsers):
      self.userIndex[u] = i
    for i, e in enumerate(uniqueEvents):
      self.eventIndex[e] = i
    with open('/Users/JQC/Desktop/data/train.csv', 'r') as f:
      f.readline().strip().split(",")
      for line in f:
          cols = line.strip().split(",")
            # 用户列
          i = self.userIndex[cols[0]]
            # 事件列
          j = self.eventIndex[cols[1]]
            # 感兴趣的列减去不感兴趣的列 为正是感兴趣 为负数为不感兴趣
          self.userEventScores[i, j] = int(cols[4]) - int(cols[5])
    scipy.io.mmwrite("PE_userEventScores", self.userEventScores)
    
    # 找出所有的关联用户和关联事件
    # 关联用户是同一事件中 至少有两个以上的用户 
    # 关联事件 同一个用户至少有两个以上的事件
    self.uniqueUserPairs = set()
    self.uniqueEventPairs = set()
    # 关联用户
    for event in uniqueEvents:
        users = usersForEvent[event]
        if len(users) > 2:
            self.uniqueUserPairs.update(itertools.combinations(users ,2))
    # 关联事件
    for user in uniqueUsers:
        events = eventsForUser[user]
        if len(events) > 2:
            self.uniqueEventPairs.update(itertools.combinations(events ,2))
    pickle.dump(self.userIndex, open("PE_userIndex.pkl", 'wb'))
    pickle.dump(self.eventIndex, open("PE_eventIndex.pkl", 'wb'))


In [14]:
programEntities = ProgramEntities()

In [15]:
programEntities.userEventScores.toarray().shape

(3391, 13418)

In [16]:
programEntities.uniqueEventPairs

{('460925649', '553400622'),
 ('3016291457', '2155134627'),
 ('43133208', '2418172778'),
 ('3841472085', '1610359399'),
 ('2464444877', '3681999765'),
 ('1417903688', '3200757190'),
 ('3033099568', '3339581566'),
 ('2090096046', '422777830'),
 ('2683569441', '1051361796'),
 ('261646692', '3835917768'),
 ('3289683998', '3642792280'),
 ('760916752', '2676714275'),
 ('3507974117', '2061870345'),
 ('746497547', '3464625870'),
 ('1640487353', '2205781579'),
 ('2497272063', '1335651304'),
 ('1500022724', '633659090'),
 ('2409382246', '4223173683'),
 ('622462500', '3847927617'),
 ('1503517875', '3369354257'),
 ('2173876437', '577524735'),
 ('3129233779', '4251687494'),
 ('4179053211', '3904594201'),
 ('924928769', '3697286181'),
 ('3102969984', '1163893685'),
 ('1783539311', '1134576678'),
 ('2075877285', '3102969984'),
 ('466507834', '3200757190'),
 ('3031574388', '1762521462'),
 ('3028432012', '3324867589'),
 ('2442645283', '552686696'),
 ('3063189490', '2825871581'),
 ('2890939858', '38092

## 3.用户与用户相似度矩阵

In [17]:
class Users:
  """
  构建 user/user 相似度矩阵
  """
  def __init__(self, programEntities, sim=ssd.correlation):
    cleaner = DataCleaner()
    nusers = len(programEntities.userIndex.keys())
    with open('/Users/JQC/Desktop/data/users.csv', 'r') as f:
        colnames = f.readline().strip().split(',')
        self.userMatrix = scipy.sparse.dok_matrix((nusers, len(colnames) - 1))
#         ln = 0
        for line in f:
#             ln += 1
#             if ln % 100 == 0:
#                 print('load line ', ln)
#             if ln > 500:
#                 break
            cols = line.strip().split(",")
            if cols[0] in programEntities.userIndex:
                # 索引列
                i = programEntities.userIndex[cols[0]]
                self.userMatrix[i, 0] = cleaner.getLocaleId(cols[1])
                self.userMatrix[i, 1] = cleaner.getBirthYearInt(cols[2])
                self.userMatrix[i, 2] = cleaner.getGenderId(cols[3])
                self.userMatrix[i, 3] = cleaner.getJoinedYearMonth(cols[4])
                self.userMatrix[i, 4] = cleaner.getCountryId(cols[5])
                self.userMatrix[i, 5] = cleaner.getTimezoneInt(cols[6])               
    # l1范式 归一化
    self.userMatrix = preprocessing.normalize(self.userMatrix, norm='l1', axis=0, copy=False)
    # 将转化后的user写到本地
    scipy.io.mmwrite('US_userMatrix', self.userMatrix)
    # 计算用户相似度矩阵
    self.userSimMatrix = scipy.sparse.dok_matrix((nusers, nusers))
    for i in range(nusers):
        self.userSimMatrix[i, i] = 1.0
    for u1, u2 in programEntities.uniqueUserPairs:
        i = programEntities.userIndex[u1]
        j = programEntities.userIndex[u2]
#         if not self.userMatrix.has_key((i, j)):
        if not (i, j) in self.userMatrix:
            usim = sim(self.userMatrix.getrow(i).todense(), self.userMatrix.getrow(j).todense())
            self.userSimMatrix[i, j] = usim
            self.userSimMatrix[j, i] = usim
    scipy.io.mmwrite("US_userSimMatrix", self.userSimMatrix)

In [None]:
users = Users(programEntities)

## 4.用户社交关系挖掘

In [19]:
class UserFriends:
  """
  找出某用户的那些朋友，想法非常简单
  1)如果你有更多的朋友，可能你性格外向，更容易参加各种活动
  2)如果你朋友会参加某个活动，可能你也会跟随去参加一下
  """
  def __init__(self, programEntities):
    nusers = len(programEntities.userIndex.keys())
    self.numFriends = np.zeros((nusers))
    self.userFriends = scipy.sparse.dok_matrix((nusers, nusers))
    
    with open('/Users/JQC/Desktop/data/user_friends.csv', 'r') as f:
        f.readline().strip().split(',')
        ln = 0
        for line in f:
            if ln % 500 == 0:
                print('load line ', ln)
            cols = line.strip().split(",")
            user = cols[0]
            if user in programEntities.userIndex:
                friends = cols[1].split(' ')
                # 找出当前用户的索引
                i = programEntities.userIndex[user]
                # 将当前用户的朋友数量在数组中记录下来
                self.numFriends[i] = len(friends)
                # 遍历当前用户的好友
                for friend in friends:
                    if friend in programEntities.userIndex:
                        j = programEntities.userIndex[friend]
                        eventForUser = programEntities.userEventScores.getrow(j).todense()
                        # 查看朋友对事件的感兴趣度 朋友感兴趣的所有事件/ 所有事件
                        score = eventForUser.sum() / programEntities.userEventScores.shape[1]
                        self.userFriends[i, j] = score
                        self.userFriends[j, i] = score
            ln += 1
            
        # 归一化
        sumNumFriends = self.numFriends.sum()
        self.numFriends = self.numFriends / sumNumFriends
        scipy.io.mmwrite('UF_numFriends', np.mat(self.numFriends))
        self.userFriends = preprocessing.normalize(self.userFriends, norm='l1', axis=0, copy=False)
        scipy.io.mmwrite('UF_userFriends', self.userFriends)


In [20]:
userFriends = UserFriends(programEntities)


load line  0
load line  500
load line  1000
load line  1500
load line  2000
load line  2500
load line  3000
load line  3500
load line  4000
load line  4500


KeyboardInterrupt: 

In [115]:
a.sum(axis=0)

45

In [58]:
(0, 0) in userFriends.userFriends

False

In [54]:
a = scipy.sparse.dok_matrix((3, 10))
a[1, 2] = 2
a[2, 2] = 2
a[2, 3] = 1
a[2, 4] = 3
a, a.keys()

(<3x10 sparse matrix of type '<class 'numpy.float64'>'
 	with 4 stored elements in Dictionary Of Keys format>,
 dict_keys([(1, 2), (2, 2), (2, 3), (2, 4)]))

In [60]:
not (1, 2) in a

False

In [123]:
def cos_sim(vector_a, vector_b):
    """
    计算两个向量之间的余弦相似度
    :param vector_a: 向量 a 
    :param vector_b: 向量 b
    :return: sim
    """
    vector_a = np.mat(vector_a)
    vector_b = np.mat(vector_b)
    num = float(vector_a * vector_b.T)
#     num = float(np.matmul(vector_a, vector_b))
    denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    print(denom)
    if denom == 0:
        return 0
    cos = num / denom
    sim = 0.5 + 0.5 * cos
    return sim


## 5.构造event和event相似度数据

In [151]:
class Events:
  """
  构建event-event相似度，注意这里有2种相似度：
  1）由用户-event行为，类似协同过滤算出的相似度
  2）由event本身的内容(event信息)计算出的event-event相似度
  """
  def __init__(self, programEntities, psim=ssd.correlation, csim=ssd.cosine):
    cleaner = DataCleaner()
    with open('/Users/JQC/Desktop/data/events.csv', 'r') as f:
        f.readline().strip().split(',')
        nevents = len(programEntities.eventIndex.keys())
        self.eventPropMatrix = scipy.sparse.dok_matrix((nevents, 7))
        self.eventContMatrix = scipy.sparse.dok_matrix((nevents, 100))
        ln = 0
        for line in f:
#             if ln > 100:
#                 break
            cols = line.strip().split(",")
            eventId = cols[0]
            if eventId in programEntities.eventIndex:
                i = programEntities.eventIndex[eventId]
                self.eventPropMatrix[i, 0] = cleaner.getJoinedYearMonth(cols[2]) # start_time
                self.eventPropMatrix[i, 1] = cleaner.getFeatureHash(cols[3].encode('utf8')) # city
                self.eventPropMatrix[i, 2] = cleaner.getFeatureHash(cols[4].encode('utf8')) # state
                self.eventPropMatrix[i, 3] = cleaner.getFeatureHash(cols[5].encode('utf8')) # zip
                self.eventPropMatrix[i, 4] = cleaner.getFeatureHash(cols[6].encode('utf8')) # country
                self.eventPropMatrix[i, 5] = cleaner.getFloatValue(cols[7]) # lat
                self.eventPropMatrix[i, 6] = cleaner.getFloatValue(cols[8]) # lon
                # 第9列开始是c_1, c_2....c_100
                for j in range(9, 109):
                  self.eventContMatrix[i, j-9] = cols[j]
                ln += 1
    # 归一化
    self.eventPropMatrix = preprocessing.normalize(self.eventPropMatrix,
        norm="l1", axis=0, copy=False)
    # 写入本地
    scipy.io.mmwrite("EV_eventPropMatrix", self.eventPropMatrix)
    self.eventContMatrix = preprocessing.normalize(self.eventContMatrix,
        norm="l1", axis=0, copy=False)
    scipy.io.mmwrite("EV_eventContMatrix", self.eventContMatrix)
    # 基于两个矩阵 计算两个事件之间的相似性
    self.eventPropSim = scipy.sparse.dok_matrix((nevents, nevents))
    self.eventContSim = scipy.sparse.dok_matrix((nevents, nevents))
    for e1, e2 in programEntities.uniqueEventPairs:
      i = programEntities.eventIndex[e1]
      j = programEntities.eventIndex[e2]
      if not (i,j) in self.eventPropSim:
#         epsim = psim(self.eventPropMatrix.getrow(i).todense(),
#           self.eventPropMatrix.getrow(j).todense())
#         epsim = 0 if np.isnan(epsim) else ecsim
        overLap = np.nonzero(np.logical_and(self.eventContMatrix.getrow(i).A > 0, \
                                            self.eventContMatrix.getrow(j).A > 0))[1]
        if len(overLap) == 0:
            epsim = 0
        else:
            epsim = csim(self.eventContMatrix.getrow(i).todense(),
          self.eventContMatrix.getrow(j).todense())
        self.eventPropSim[i, j] = epsim
        self.eventPropSim[j, i] = epsim
      if not (i,j) in self.eventContSim:
        overLap = np.nonzero(np.logical_and(self.eventContMatrix.getrow(i).A > 0, \
                                            self.eventContMatrix.getrow(j).A > 0))[1]
        if len(overLap) == 0:
            ecsim = 0
        else:
            ecsim = csim(self.eventContMatrix.getrow(i).todense(),
          self.eventContMatrix.getrow(j).todense())
            
        self.eventContSim[i, j] = epsim
        self.eventContSim[j, i] = epsim
    scipy.io.mmwrite("EV_eventPropSim", self.eventPropSim)
    scipy.io.mmwrite("EV_eventContSim", self.eventContSim)
    


In [152]:
events = Events(programEntities)


39 0 249


## 6.活跃度/event热度 数据

In [138]:
class EventAttendees():
  """
  统计某个活动，参加和不参加的人数，从而为活动活跃度做准备
  """
  def __init__(self, programEntities):
    nevents = len(programEntities.eventIndex.keys())
    self.eventPopularity = scipy.sparse.dok_matrix((nevents, 1))
    with open('/Users/JQC/Desktop/data/event_attendees.csv', 'r') as f:
        f.readline()
        for line in f:
            cols = line.strip().split(',')
            eventId = cols[0]
            if eventId in programEntities.eventIndex:
                i = programEntities.eventIndex[eventId]
                # 参加活动的人数减去没有参加活动的人数
                self.eventPopularity[i, 0] = len(cols[1].split(' ')) - len(cols[4].split(' '))
    self.eventPopularity = preprocessing.normalize(self.eventPopularity, norm='l1',  axis=0, copy=False)
    scipy.io.mmwrite('EA_eventPopularity', self.eventPopularity)

In [139]:
eventAttendees = EventAttendees(programEntities)

In [142]:
eventAttendees.eventPopularity

<13418x1 sparse matrix of type '<class 'numpy.float64'>'
	with 12635 stored elements in Compressed Sparse Column format>

In [2]:
programEntities

NameError: name 'programEntities' is not defined

## 7.串起所有的数据处理和准备流程

In [131]:
def data_prepare():
  """
  计算生成所有的数据，用矩阵或者其他形式存储方便后续提取特征和建模
  """
  print "第1步：统计user和event相关信息..."
  pe = ProgramEntities()
  print "第1步完成...\n"
  print "第2步：计算用户相似度信息，并用矩阵形式存储..."
  Users(pe)
  print "第2步完成...\n"
  print "第3步：计算用户社交关系信息，并存储..."
  UserFriends(pe)
  print "第3步完成...\n"
  print "第4步：计算event相似度信息，并用矩阵形式存储..."
  Events(pe)
  print "第4步完成...\n"
  print "第5步：计算event热度信息..."
  EventAttendees(pe)
  print "第5步完成...\n"

data_prepare()

0.006600732201217152

#### 8.构建特征

In [1]:
class DataRewriter:
  def __init__(self):
    self.userIndex = pickle.load(open("PE_userIndex.pkl", 'rb'))
    self.eventIndex = pickle.load(open('PE_eventIndex.pkl', 'rb'))
    self.userEventScores = scipy.io.mmread('PE_userEventScores').todense()
    self.userSimMatrix = sio.mmread("US_userSimMatrix").todense()
    self.eventPropSim = sio.mmread("EV_eventPropSim").todense()
    self.eventContSim = sio.mmread("EV_eventContSim").todense()
    self.numFriends = sio.mmread("UF_numFriends")
    self.userFriends = sio.mmread("UF_userFriends").todense()
    self.eventPopularity = sio.mmread("EA_eventPopularity").todense()

  def userReco(self, userId, eventId):
    """
    根据User-based协同过滤，得到event的推荐度
    基本的伪代码思路如下：
    """
    i = self.userIndex[userId]
    j = self.eventIndex[eventId]
    # 获取j事件下的score
    vs = self.userEventScores[:, j]
    # 获取改用户与其他所有用户的相似度
    sims = self.userSimMatrix[i, :]
    # 用户的相似度相当于权重 在乘以对应score 就得到了加权后的总的推荐度
    prod = sims * vs
    try:
        # 总的推荐度 需要减去自身的score
      return prod[0, 0] - self.userEventScores[i, j]
    except IndexError:
      return 0

  def eventReco(self, userId, eventId):
    """
    根据基于物品的协同过滤，得到Event的推荐度
    基本的伪代码思路如下：
    """
    i = self.userIndex[userId]
    j = self.eventIndex[eventId]
    js = self.userEventScores[:, j]
    # 拿到该事件和其他事件的相似度 在拿到其他事件的score 就可以计算出该事件的推荐度
    psim = self.eventPropSim[:, j]
    csim = self.eventContSim[:, j]
    pprod = js * psim
    cprod = js * csim
    pscore = 0
    cscore = 0
    try:
      pscore = pprod[0, 0] - self.userEventScores[i, j]
    except IndexError:
      pass
    try:
      cscore = cprod[0, 0] - self.userEventScores[i, j]
    except IndexError:
      pass
    return pscore, cscore  

  def userPop(self, userId):
    """
    基于用户的朋友个数来推断用户的社交程度
    主要的考量是如果用户的朋友非常多，可能会更倾向于参加各种社交活动
    """
    if userId in self.userIndex:
        i = self.userIndex[userId]
        return self.numFriends[0, i]
    else:
        return 0

  def friendInfluence(self, userId):
    """
    朋友对用户的影响
    主要考虑用户所有的朋友中，有多少是非常喜欢参加各种社交活动/event的
    用户的朋友圈如果都积极参与各种event，可能会对当前用户有一定的影响
    """
    nusers = np.shape(self.userFriends)[1]
    i = self.userIndex[userId]
    return (self.userFriends[i, :].sum(axis=0) / nusers)[0,0]

  def eventPop(self, eventId):
    """
    本活动本身的热度
    主要是通过参与的人数来界定的
    """
    i = self.eventIndex[eventId]
    return self.eventPopularity[i, 0]

  def rewriteData(self, start=1, train=True, header=True):
    """
    把前面user-based协同过滤 和 item-based协同过滤，以及各种热度和影响度作为特征组合在一起
    生成新的训练数据，用于分类器分类使用
    """
    filename = '/Users/JQC/Desktop/data/'
    fn = "train.csv" if train else "test.csv"
    fin = open(filename + fn, 'rb')
    fout = open(filename + "data_" + fn, 'wb')
    # write output header
    if header:
      ocolnames = ["invited", "user_reco", "evt_p_reco",
        "evt_c_reco", "user_pop", "frnd_infl", "evt_pop"]
      if train:
        ocolnames.append("interested")
        ocolnames.append("not_interested")
      fout.write(",".join(ocolnames) + "\n")
    ln = 0
    for line in fin:
      ln += 1
      if ln < start:
        continue
      cols = line.strip().split(",")
      userId = cols[0]
      eventId = cols[1]
      invited = cols[2]
      if ln%500 == 0:
          print "%s:%d (userId, eventId)=(%s, %s)" % (fn, ln, userId, eventId)
      user_reco = self.userReco(userId, eventId)
      evt_p_reco, evt_c_reco = self.eventReco(userId, eventId)
      user_pop = self.userPop(userId)
#       frnd_infl = self.friendInfluence(userId)
#       evt_pop = self.eventPop(eventId)
      frnd_infl = self.friendInfluence(userId)
      evt_pop = self.eventPop(eventId)
      ocols = [invited, user_reco, evt_p_reco,
        evt_c_reco, user_pop, frnd_infl, evt_pop]
      if train:
        ocols.append(cols[4]) # interested
        ocols.append(cols[5]) # not_interested
      fout.write(",".join(map(lambda x: str(x), ocols)) + "\n")
    fin.close()
    fout.close()

  def rewriteTrainingSet(self):
    self.rewriteData(True)

  def rewriteTestSet(self):
    self.rewriteData(False)


SyntaxError: invalid syntax (<ipython-input-1-65c25587d071>, line 113)