Poniższe komórki należy uruchomić zawsze (przed pozostałymi)

In [1]:
import cPickle as pickle
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import Parallel, delayed
import multiprocessing
from sklearn.metrics.pairwise import cosine_similarity as cosine
from sklearn.metrics import jaccard_similarity_score as jaccard
import time

In [2]:
# Column dtypes passed to pd.read_csv for the ItemInfo_*.csv files,
# grouped by dtype.
typesinput = {}
typesinput.update(dict.fromkeys(
    ['itemID', 'categoryID', 'locationID'],
    np.dtype(int),
))
typesinput.update(dict.fromkeys(
    ['title', 'description', 'images_array', 'attrsJSON'],
    np.dtype(str),
))
typesinput.update(dict.fromkeys(
    ['price', 'metroID', 'lat', 'lon'],
    np.dtype(float),
))

# Column dtypes passed to pd.read_csv for the ItemPairs_*.csv files
# (every column is an integer).
types1 = dict.fromkeys(
    ['itemID_1', 'itemID_2', 'isDuplicate', 'generationMethod'],
    np.dtype(int),
)

In [None]:
def compute_similarity_and_save(similarity_function, tfidf_file, pairs, itemstest, output_name):
    """Compute a similarity score for every item pair and write it to CSV.

    Parameters:
        similarity_function: either "cosine_sim" or "jaccard_sim".
        tfidf_file: name of a pickled matrix inside the "tfidfs/" directory.
            Row i of the matrix is assumed to belong to row i of
            ``itemstest`` -- TODO confirm against the cell that wrote it.
        pairs: DataFrame with 'itemID_1' and 'itemID_2' columns. It is NOT
            modified, so the same frame can be reused for several calls.
        itemstest: DataFrame whose 'itemID' column gives the row order of
            the pickled matrix.
        output_name: file name created inside the "similarities/" directory.

    Raises:
        ValueError: if ``similarity_function`` is not a supported name.
        KeyError: if a pair references an item ID absent from ``itemstest``.
    """
    similarity_by_name = {"cosine_sim": cosine, "jaccard_sim": jaccard}
    if similarity_function not in similarity_by_name:
        # Fail before the expensive load instead of hitting an unbound
        # local variable after it.
        raise ValueError("unknown similarity_function: %r" % (similarity_function,))
    similarity = similarity_by_name[similarity_function]

    # NOTE: pickle must only be used on trusted, locally generated files.
    # Pickles are binary data, so open in binary mode and close the handle.
    with open("tfidfs/" + tfidf_file, "rb") as tfidf_handle:
        tfidfs_desc = pickle.load(tfidf_handle)

    # Map item IDs to row positions without touching the caller's frame.
    index_ = pd.Index(itemstest.itemID)
    rows_1 = index_.get_indexer(pairs['itemID_1'])
    rows_2 = index_.get_indexer(pairs['itemID_2'])
    for rows, column in ((rows_1, 'itemID_1'), (rows_2, 'itemID_2')):
        # get_indexer marks IDs that are not in the index with -1.
        if (rows < 0).any():
            raise KeyError("%s contains item IDs missing from itemstest.itemID" % column)

    res = np.empty(len(rows_1), dtype=float)
    for pos, (row_1, row_2) in enumerate(zip(rows_1, rows_2)):
        score = similarity(tfidfs_desc[row_1], tfidfs_desc[row_2])
        # cosine_similarity returns a 1x1 array; reduce every result to a scalar.
        res[pos] = np.ravel(score)[0]

    train_simil = pd.Series(res)
    train_simil.to_csv("similarities/" + output_name)

# DESCRIPTION TF-IDFS

Generacja TF-IDF dla opisów

In [None]:

# Read only the description column of the train and test item tables and
# replace missing descriptions with empty strings.
items = pd.read_csv(
    "input/ItemInfo_train.csv", dtype=typesinput, usecols=['description']
).fillna("")
itemstest = pd.read_csv(
    "input/ItemInfo_test.csv", dtype=typesinput, usecols=['description']
).fillna("")

In [None]:

# Check TF-IDF with the stop words removed
russian_stopwords = stopwords.words('russian')
all_descriptions = pd.concat([items.description, itemstest.description])
# The vocabulary is fitted on the train and test descriptions together.
tfidf = TfidfVectorizer(stop_words=russian_stopwords)
tfidf.fit(all_descriptions)

In [None]:
# Vectorize the training descriptions here, so the dump below never pickles
# an undefined or stale `transf` left over from another cell.
transf = tfidf.transform(items.description)
# Pickles are binary data: write in binary mode and close the file explicitly.
with open("tfidfs/description_stopwords_train.data", "wb") as out_file:
    pickle.dump(transf, out_file)

In [None]:
# Vectorize the test descriptions with the fitted vectorizer and store them.
transf = tfidf.transform(itemstest.description)
# Pickles are binary data: write in binary mode and close the file explicitly.
with open("tfidfs/description_stopwords_test.data", "wb") as out_file:
    pickle.dump(transf, out_file)

# TITLE TF-IDFS

Generacja TF-IDF dla tytułów

In [None]:

# Read only the title column of the train and test item tables and replace
# missing titles with empty strings.
items = pd.read_csv(
    "input/ItemInfo_train.csv", dtype=typesinput, usecols=['title']
).fillna("")
itemstest = pd.read_csv(
    "input/ItemInfo_test.csv", dtype=typesinput, usecols=['title']
).fillna("")

In [None]:

# Title TF-IDF with Russian stop words removed; the vocabulary is fitted on
# the train and test titles together.
tfidf = TfidfVectorizer(stop_words = stopwords.words('russian')).fit(pd.concat([items.title, itemstest.title]))
transf = tfidf.transform(items.title)
# Pickles are binary data: write in binary mode and close each file explicitly.
with open("tfidfs/title_stopwords_train.data", "wb") as out_file:
    pickle.dump(transf, out_file)
transf = tfidf.transform(itemstest.title)
with open("tfidfs/title_stopwords_test.data", "wb") as out_file:
    pickle.dump(transf, out_file)

In [None]:

# Title TF-IDF with the default vectorizer settings (no stop-word list);
# the vocabulary is fitted on the train and test titles together.
tfidf = TfidfVectorizer().fit(pd.concat([items.title, itemstest.title]))
transf = tfidf.transform(items.title)
# Pickles are binary data: write in binary mode and close each file explicitly.
with open("tfidfs/title_train.data", "wb") as out_file:
    pickle.dump(transf, out_file)
transf = tfidf.transform(itemstest.title)
with open("tfidfs/title_test.data", "wb") as out_file:
    pickle.dump(transf, out_file)

# IMAGES TF-IDFS

Generacja TF-IDF dla oznaczeń obrazków

In [None]:
# Fit a TF-IDF vectorizer on individual image labels and pickle it.
images_tfidf_limit = 2000000  # The rest will more or less repeat.

dataset = pd.read_csv('input/labels/labels_merged.csv', usecols=['labels'])
dataset.fillna("", inplace=True)

wordlist = []

for i, row in enumerate(dataset.values):
    if i > 0 and i % 50000 == 0:
        print("step " + str(i))
    if i > 0 and i % images_tfidf_limit == 0:
        break
    # `row` is a one-element array, so str(row) presumably looks like
    # "['<labels>']"; the trailing "']" is stripped from the last word below.
    for wordp in str(row).split(';'):
        fields = wordp.split(':')
        if len(fields) < 2:
            # No ':' separator (e.g. an empty labels cell): nothing to
            # extract. Previously this raised IndexError.
            continue
        for word in fields[1].split(','):
            if len(word) >= 2 and word.endswith(']'):
                word = word[:-2]
            if not word:
                # Skip empty entries, including ones that become empty
                # after stripping the "']" suffix.
                continue
            wordlist.append(word.lower())

#print wordlist

print("processed " + str(len(wordlist)))
print("vectorizing... ")

# Each collected label is treated as one document when fitting.
tfidf = TfidfVectorizer().fit(wordlist)
# Pickles are binary data: write in binary mode and close the file explicitly.
with open('tfidfs/image.data', 'wb') as out_file:
    pickle.dump(tfidf, out_file)

print("vectorization done!")

In [3]:
# Pair similarity generation - training set

print('wczytywanie...')
pairs = pd.read_csv("input/ItemPairs_train.csv", dtype=types1)
# NOTE: pickle must only be used on trusted, locally generated files.
# Pickles are binary data: read in binary mode and close the file explicitly.
with open("tfidfs/image.data", "rb") as image_tfidf_handle:
    tfidfs = pickle.load(image_tfidf_handle)
df = pd.read_csv("input/ItemInfo_train.csv", dtype=typesinput, usecols=['itemID', 'images_array'])
label_df = pd.read_csv('input/labels/labels_merged.csv', usecols=['id', 'labels'])
label_df.fillna("", inplace=True)

def get_labelstring(raw):
    """Flatten a raw labels value into a comma-separated, lower-cased string.

    ``str(raw)`` is split on ';' into segments. In each segment the text
    between the first and the second ':' is split on ',' into labels.
    Segments without a ':' (for example an empty ``raw``) contribute nothing
    instead of raising IndexError. Empty labels are dropped.

    Returns:
        The labels joined with ','; an empty string when there are none.
    """
    wordlist = []
    for wordp in str(raw).split(';'):
        fields = wordp.split(':')
        if len(fields) < 2:
            continue
        for word in fields[1].split(','):
            # A label ending in ']' loses its last two characters; this
            # presumably targets a "']" suffix left by str() of an array row.
            if len(word) >= 2 and word.endswith(']'):
                word = word[:-2]
            if not word:
                continue
            wordlist.append(word.lower())
    return ','.join(wordlist)

def get_label_string(idn):
    """Return the comma-separated labels of all images of item ``idn``.

    The item's ``images_array`` value is looked up in the module-level
    ``df``; each comma-separated image id in it is looked up in
    ``label_df`` and its labels are flattened with ``get_labelstring``.
    Items whose ``images_array`` is missing (stringifies to "nan") yield "".

    Raises:
        IndexError: if ``idn`` is not in ``df`` or an image id is not in
            ``label_df``.
    """
    arr1 = df.loc[df['itemID'] == idn]['images_array']
    images = arr1.tolist()[0]
    if "nan" == str(images):
        return ""
    parts = []
    for elem in images.split(','):
        label = label_df.loc[label_df['id'] == int(elem)]['labels']
        raw = label.tolist()[0]
        labels = get_labelstring(raw)
        if labels:
            parts.append(labels)
    # Join with ',' so the last label of one image is not fused with the
    # first label of the next image.
    return ','.join(parts)

def get_cosine(id1, id2):
    """Return the cosine similarity between the image labels of two items.

    Both label strings are built with ``get_label_string`` and vectorized
    with the module-level ``tfidfs`` vectorizer. The result is whatever
    ``cosine`` returns for the two row vectors.
    """
    label_texts = [get_label_string(item_id) for item_id in (id1, id2)]
    vec_first, vec_second = [tfidfs.transform([text])[0] for text in label_texts]
    return cosine(vec_first, vec_second)

def processInput(val):
    """Compute the label similarity for one ``(index, row)`` work item.

    The first two entries of ``row`` are used as the two item IDs; the
    index is ignored.
    """
    _, pair_row = val
    first_id, second_id = pair_row[0], pair_row[1]
    return get_cosine(first_id, second_id)

# Join the pair table with the image list of each of the two items
df_left = df.rename(columns={'images_array': 'images_array_1'})
df_right = df.rename(columns={'images_array': 'images_array_2'})
# Match on the item-ID columns only. `left_index=True` was dropped: it
# contradicts `left_on`, and recent pandas versions are expected to reject
# the combination with a MergeError -- TODO confirm on the pandas in use.
pairs_1 = pd.merge(pairs, df_left, how='left', left_on='itemID_1', right_on='itemID')
pairs_12 = pd.merge(pairs_1, df_right, how='left', left_on='itemID_2', right_on='itemID')
print('wczytane.')

# This is definitely too slow and needs to be sped up somehow
print('przetwarzanie...')
# Only the first 1000 pairs are processed (timing experiment).
inputs = enumerate(pairs_12.head(1000).values)
# Leave one core free, but never ask for 0 workers on a single-core machine.
num_cores = max(1, multiprocessing.cpu_count() - 1)
start = time.time()
results = Parallel(n_jobs=num_cores)(delayed(processInput)(i) for i in inputs)
end = time.time()

# Elapsed wall-clock time in seconds.
print(end - start)
#results


wczytywanie...
wczytane.
przetwarzanie...
43.2824878693


# COSINE 

Liczenie podobieństwa cosinusowego

In [None]:

# Load the test pairs, the pickled description TF-IDF data for the test
# set, and the test item IDs.
pairs = pd.read_csv("input/ItemPairs_test.csv", dtype=types1)
# NOTE: pickle must only be used on trusted, locally generated files.
# Pickles are binary data: read in binary mode and close the file explicitly.
with open("tfidfs/description_stopwords_test.data", "rb") as tfidf_handle:
    tfidfs_desc = pickle.load(tfidf_handle)
itemstest = pd.read_csv("input/ItemInfo_test.csv", dtype=typesinput, usecols=['itemID'])

In [None]:
# Replace the item IDs in `pairs` with their row positions in `itemstest`,
# which are then used to index `tfidfs_desc`.
index_ = pd.Index(itemstest.itemID)
pairs.itemID_1 = pairs.apply(lambda x: index_.get_loc(x['itemID_1']), axis=1)
pairs.itemID_2 = pairs.apply(lambda x: index_.get_loc(x['itemID_2']), axis=1)


def cosine_sim(itemid1, itemid2):
    """Return the cosine similarity of two rows of `tfidfs_desc` as a scalar."""
    # `cosine` returns a 1x1 array for two single-row inputs.
    return cosine(tfidfs_desc[itemid1], tfidfs_desc[itemid2])[0, 0]


# Compute `res` here; the following cell wraps it in a Series, and it was
# not assigned anywhere in this section before.
vecfunc = np.vectorize(cosine_sim)
res = vecfunc(pairs['itemID_1'], pairs['itemID_2'])

In [None]:
# Wrap the similarity values in a Series; `res` is presumably one value
# per row of `pairs`, computed in an earlier cell.
test_simil = pd.Series(res)

In [None]:
# Write the test-set description similarities to CSV.
test_simil.to_csv("similarities/test_description_stopwords_cosine.csv")

In [None]:
# Load the training pairs, the pickled description TF-IDF data for the
# training set, and the training item IDs. The item table keeps the name
# `itemstest` even though it holds the training items.
pairs = pd.read_csv("input/ItemPairs_train.csv", dtype=types1)
# NOTE: pickle must only be used on trusted, locally generated files.
# Pickles are binary data: read in binary mode and close the file explicitly.
with open("tfidfs/description_stopwords_train.data", "rb") as tfidf_handle:
    tfidfs_desc = pickle.load(tfidf_handle)
itemstest = pd.read_csv("input/ItemInfo_train.csv", dtype=typesinput, usecols=['itemID'])

In [None]:
# Replace the item IDs in `pairs` with their row positions in `itemstest`,
# which are then used to index `tfidfs_desc`.
index_ = pd.Index(itemstest.itemID)
pairs.itemID_1 = pairs.apply(lambda x: index_.get_loc(x['itemID_1']), axis=1)
pairs.itemID_2 = pairs.apply(lambda x: index_.get_loc(x['itemID_2']), axis=1)


def cosine_sim(itemid1, itemid2):
    """Return the cosine similarity of two rows of `tfidfs_desc` as a scalar."""
    # Defined here because no module-level `cosine_sim` exists otherwise.
    # `cosine` returns a 1x1 array for two single-row inputs.
    return cosine(tfidfs_desc[itemid1], tfidfs_desc[itemid2])[0, 0]


vecfunc = np.vectorize(cosine_sim)
res = vecfunc(pairs['itemID_1'], pairs['itemID_2'])

In [None]:
# Wrap the similarity values computed for the training pairs in a Series.
train_simil = pd.Series(res)

In [None]:
# Write the training-set description similarities to CSV.
train_simil.to_csv("similarities/train_description_stopwords_cosine.csv")

In [None]:
# IPython magic (not plain Python): show matplotlib figures inline in the notebook.
%matplotlib inline

# COSINE TITLE

In [None]:

# Load the test pairs, the title TF-IDF data and the test item IDs.
pairs = pd.read_csv("input/ItemPairs_test.csv", dtype=types1)
# Use the *test* title data: the pairs and items loaded here are the test
# set, so the train file would pair row positions with the wrong items.
# NOTE: pickle must only be used on trusted, locally generated files.
# Pickles are binary data: read in binary mode and close the file explicitly.
with open("tfidfs/title_stopwords_test.data", "rb") as tfidf_handle:
    tfidfs_desc = pickle.load(tfidf_handle)
itemstest = pd.read_csv("input/ItemInfo_test.csv", dtype=typesinput, usecols=['itemID'])

In [None]:
# Replace the item IDs in `pairs` with their row positions in `itemstest`,
# which are then used to index `tfidfs_desc`.
index_ = pd.Index(itemstest.itemID)
pairs.itemID_1 = pairs.apply(lambda x: index_.get_loc(x['itemID_1']), axis=1)
pairs.itemID_2 = pairs.apply(lambda x: index_.get_loc(x['itemID_2']), axis=1)

# Can be parallelized


def cosine_sim(itemid1, itemid2):
    """Return the cosine similarity of two rows of `tfidfs_desc` as a scalar."""
    # Defined here because no module-level `cosine_sim` exists otherwise.
    # `cosine` returns a 1x1 array for two single-row inputs.
    return cosine(tfidfs_desc[itemid1], tfidfs_desc[itemid2])[0, 0]


vecfunc = np.vectorize(cosine_sim)
res = vecfunc(pairs['itemID_1'], pairs['itemID_2'])

In [None]:
# Wrap the title similarity values computed for the test pairs in a Series.
test_simil = pd.Series(res)

In [None]:
# Write the test-set title similarities to CSV.
test_simil.to_csv("similarities/test_title_stopwords_cosine.csv")

In [None]:
# Load the training pairs and the training item IDs. The item table keeps
# the name `itemstest` even though it holds the training items.
pairs = pd.read_csv("input/ItemPairs_train.csv", dtype=types1)
itemstest = pd.read_csv("input/ItemInfo_train.csv", dtype=typesinput, usecols=['itemID'])
# Disabled call, kept for reference:
# compute_similarity_and_save("cosine_sim", "title_stopwords_train.data", pairs, itemstest, "train_title_stopwords_cosine.csv")

In [None]:
# Pass a copy: `pairs` is shared with the following cells, and the callee
# may overwrite the ID columns of the frame it receives.
compute_similarity_and_save("cosine_sim", "title_train.data", pairs.copy(), itemstest, "train_title_cosine.csv")

In [None]:
# Pass a copy: `pairs` is shared with the neighbouring cells, and the callee
# may overwrite the ID columns of the frame it receives.
compute_similarity_and_save("jaccard_sim", "title_train.data", pairs.copy(), itemstest, "train_title_jaccard.csv")

In [None]:
# Pass a copy: `pairs` is shared with the preceding cells, and the callee
# may overwrite the ID columns of the frame it receives.
compute_similarity_and_save("jaccard_sim", "title_stopwords_train.data", pairs.copy(), itemstest, "train_title_stopwords_jaccard.csv")

In [None]:
# Test set: cosine similarity of the stop-word-filtered title TF-IDF data.
itemstest = pd.read_csv("input/ItemInfo_test.csv", dtype=typesinput, usecols=['itemID'])
pairs = pd.read_csv("input/ItemPairs_test.csv", dtype=types1)
compute_similarity_and_save(
    similarity_function="cosine_sim",
    tfidf_file="title_stopwords_test.data",
    pairs=pairs,
    itemstest=itemstest,
    output_name="test_title_stopwords_cosine.csv",
)

In [None]:
# Test set: cosine similarity of the plain title TF-IDF data.
itemstest = pd.read_csv("input/ItemInfo_test.csv", dtype=typesinput, usecols=['itemID'])
pairs = pd.read_csv("input/ItemPairs_test.csv", dtype=types1)
compute_similarity_and_save(
    similarity_function="cosine_sim",
    tfidf_file="title_test.data",
    pairs=pairs,
    itemstest=itemstest,
    output_name="test_title_cosine.csv",
)

In [None]:
# Test set: Jaccard similarity of the plain title TF-IDF data.
itemstest = pd.read_csv("input/ItemInfo_test.csv", dtype=typesinput, usecols=['itemID'])
pairs = pd.read_csv("input/ItemPairs_test.csv", dtype=types1)
compute_similarity_and_save(
    similarity_function="jaccard_sim",
    tfidf_file="title_test.data",
    pairs=pairs,
    itemstest=itemstest,
    output_name="test_title_jaccard.csv",
)

In [None]:
# Test set: Jaccard similarity of the stop-word-filtered title TF-IDF data.
pairs = pd.read_csv("input/ItemPairs_test.csv", dtype=types1)
# Reload the test items, as the sibling cells do, so this cell does not
# depend on which item table an earlier cell left in `itemstest`.
itemstest = pd.read_csv("input/ItemInfo_test.csv", dtype=typesinput, usecols=['itemID'])
# Output name fixed from the typo "tessttitle_..." to match the other files.
compute_similarity_and_save("jaccard_sim", "title_stopwords_test.data", pairs, itemstest, "test_title_stopwords_jaccard.csv")