In [1]:
import itertools
import operator
import pickle
import re
from collections import Counter
from collections import defaultdict

import nltk
import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from scipy import sparse as sp_sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from textblob import TextBlob

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed in 0.20, so it is kept only as a
# fallback for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



In [2]:
# Load the tag table; the 'Id' and 'Tag' columns are the ones used below.
tags = pd.read_csv('./Tags.csv')

In [3]:
# Count how often each tag occurs.
x = tags['Tag'].value_counts()

In [4]:
# Peek at the first ten entries of the tag counts.
x.head(10)

javascript    124155
java          115212
c#            101186
php            98808
android        90659
jquery         78542
python         64601
html           58976
c++            47591
ios            47009
Name: Tag, dtype: int64

In [5]:
# Keep only the tags that occur more than 35,000 times, as a one-column frame.
x = x[x > 35000].to_frame()

In [6]:
# Move the tag names out of the index into a regular column.
x = x.reset_index()

In [7]:
# Keep only the rows whose tag is one of the frequent tags held in `x`.
# After reset_index() the tag names are in the first column of `x`, but the
# label of that column depends on the pandas version ('index' before 2.0,
# 'Tag' from 2.0 on), so the column is selected by position instead of by name.
valid_ids = tags[tags['Tag'].isin(x.iloc[:, 0])]

In [8]:
# Number of rows left after the tag filter.
len(valid_ids.Id)

947293

In [9]:
# Number of distinct question IDs among those rows.
len(valid_ids.Id.unique())

759996

In [10]:
# Collect the tags of each question ID into one list per ID.
# The list is built directly from the grouped values; joining on ' ' and
# splitting again would break any tag that itself contains a space and does
# twice the string work for the same result.
valid_ids = valid_ids.groupby('Id')['Tag'].apply(list).reset_index()

In [11]:
# Load the questions table; the file is decoded as latin-1
# (presumably because it is not valid UTF-8 - TODO confirm).
questions = pd.read_csv('./Questions.csv', encoding = 'latin-1')

In [12]:
# Right join on 'Id': every ID in valid_ids is kept, with the question columns
# attached where a matching question exists.
ques_tags = questions.merge(valid_ids, how='right', on='Id')

In [13]:
# Row count after the merge.
len(ques_tags)

759996

In [14]:
# Keep only questions with a positive score. Rows whose Score is missing
# also fail the comparison and are dropped.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not write into a slice of the merged frame
# (SettingWithCopyWarning).
ques_tags = ques_tags[ques_tags.Score > 0].copy()

In [15]:
# Save the filtered question/tag frame to disk as a pickle file named 'top10'.
ques_tags.to_pickle('top10')

In [16]:
# Stop-word set: NLTK's English list plus three capitalised first-person forms
# (the membership test that uses this set is case-sensitive).
stopwords1 = set(stopwords.words('english')) | {"I'm", "I've", 'I'}
stopwords1

{'I',
 "I'm",
 "I've",
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out

In [17]:
def cleanhtml(raw_html):
    '''
    Remove HTML tags from a string while keeping the text between them.

    A tag is matched as '<', any run of characters that are not angle
    brackets, then '>'. Because the match can never run across another '<'
    or '>', the text sitting between an opening and a closing tag is never
    swallowed, and the whitespace around tags is left untouched so that
    neighbouring words are not glued together.

    raw_html: a string that may contain HTML markup

    return: the string with every tag removed
    '''
    return re.sub(r'<[^<>]*>', '', raw_html)

In [18]:
def integer_frequency(list_str):
    '''
    Strip URL lines, basic punctuation and stop words from a text.

    Every line containing an http(s) URL is removed, '.', ',' and line
    breaks become spaces, '?' is deleted, and the space-separated tokens
    found in the module-level `stopwords1` set are dropped.

    list_str: a string

    return: the remaining tokens joined with single spaces
    '''
    url_line_patterns = (
        r'^.*?https?:\/\/.*[\r\n]*',
        r'^https?:\/\/.*[\r\n]*',
    )
    for url_line in url_line_patterns:
        list_str = re.sub(url_line, '', list_str, flags=re.MULTILINE)

    # One translation table stands in for the chain of single-character replaces.
    punctuation = str.maketrans({'.': ' ', ',': ' ', '\n': ' ', '\r': ' ', '?': None})
    tokens = list_str.translate(punctuation).split(' ')
    return " ".join(filter(lambda token: token not in stopwords1, tokens))

In [19]:
#Clean the whole dataset and combine Question Title & Body

# The cleaned body is built as a separate Series instead of being written back
# into ques_tags column by column: that avoids chained-assignment warnings on a
# filtered frame and leaves the incoming frame unmodified.
body_clean = (
    ques_tags['Body']
    .str.replace('<p>', '')
    .str.replace('</p>', '')
    .apply(cleanhtml)
    .apply(integer_frequency)
)

# 'ques' is the title followed by the cleaned body; only the ID, that text and
# the tag list are kept.
ques_tags = ques_tags.assign(ques=ques_tags['Title'] + ' ' + body_clean)[['Id', 'ques', 'Tag']]


In [20]:
# Row count after score filtering and cleaning.
len(ques_tags)

349136

# Modelling

In [21]:
# Features are the combined question text, targets are the per-question tag lists.
X = ques_tags['ques'].values
y = ques_tags['Tag'].values

# Hold out 20% of the data as the test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Split 20% of the remaining training data off as the validation set.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [22]:
# Raw string literals: '\[', '\]' and '\|' are regex escapes, not Python string
# escapes, and are invalid escape sequences in a normal string literal.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """
        Lower-case a text, normalise its punctuation and drop stop words.

        text: a string

        return: the cleaned string, tokens separated by single spaces
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replaces REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # deletes symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # deletes stopwords from text
    return text

In [23]:
# Run every split through the same text_prepare normalisation.
X_train = list(map(text_prepare, X_train))
X_val = list(map(text_prepare, X_val))
X_test = list(map(text_prepare, X_test))

In [24]:
# Show the first three prepared training texts.
X_train[:3]

['responsive web svg visualization building interactive svg visualization see image would like provide 2 decent fallback javascript disabled modernizr think better work fixed discrete pixel widths svg block adapt available width need window resize listener dynamically update layout overkill using media queries regarding 2 keeping design responsive would good way replacing svg visualization static images',
 'exiting application messagebox call constructor want show dialog message box actual form user selects application completly closed trying use code even clicking form shown public form1 initializecomponent messagebox show contiue question messageboxbuttons yesno messageboxicon none messageboxdefaultbutton button1 dialogresult application exit also tried exepction application run problem idea would best approach',
 'play audio file recorded via media recorder browser using android media recorder record audio code works fine audio getting recorded played successfully device setting out

In [25]:
# Dictionary of all tags from train corpus with their counts.
tags_counts = defaultdict(int)
# Dictionary of all words from train corpus with their counts.
words_counts = defaultdict(int)

# The loop variable is deliberately not called `tags`: that name is bound to
# the DataFrame read from Tags.csv, and rebinding it here would break a re-run
# of the cells that filter that frame.
for sample_tags in y_train:
    for tag in sample_tags:
        tags_counts[tag] += 1

# A tag that appears only in the validation split is registered with a count
# of 1, so that it is still present among the keys of tags_counts.
for sample_tags in y_val:
    for tag in sample_tags:
        if tag not in tags_counts:
            tags_counts[tag] += 1

for text in X_train:
    for word in text.split():
        words_counts[word] += 1

In [26]:
# Three highest-count (key, count) pairs for tags and for words.
by_count = operator.itemgetter(1)
most_common_tags = sorted(tags_counts.items(), key=by_count, reverse=True)[:3]
most_common_words = sorted(words_counts.items(), key=by_count, reverse=True)[:3]

In [27]:
# Vocabulary for the bag-of-words features: the DICT_SIZE most frequent
# training words, each mapped to its rank (0 = most frequent).
DICT_SIZE = 5000
INDEX_TO_WORDS = sorted(words_counts, key=words_counts.get, reverse=True)[:DICT_SIZE]
WORDS_TO_INDEX = dict(zip(INDEX_TO_WORDS, range(len(INDEX_TO_WORDS))))
ALL_WORDS = WORDS_TO_INDEX.keys()

def my_bag_of_words(text, words_to_index, dict_size):
    """
        Count how often each vocabulary word occurs in a text.

        text: a string, tokenised on whitespace
        words_to_index: mapping from word to its position in the vector
        dict_size: size of the dictionary (length of the returned vector)

        return a vector of length dict_size holding the per-word counts;
        words missing from words_to_index are ignored
    """
    counts = np.zeros(dict_size)
    known_positions = (
        words_to_index[token]
        for token in text.split()
        if token in words_to_index
    )
    for position in known_positions:
        counts[position] += 1
    return counts

In [28]:
def _bag_matrix(texts):
    """Stack the bag-of-words row of every text into one sparse matrix."""
    rows = [
        sp_sparse.csr_matrix(my_bag_of_words(document, WORDS_TO_INDEX, DICT_SIZE))
        for document in texts
    ]
    return sp_sparse.vstack(rows)

X_train_mybag = _bag_matrix(X_train)
X_val_mybag = _bag_matrix(X_val)
X_test_mybag = _bag_matrix(X_test)
print('X_train shape ', X_train_mybag.shape)
print('X_val shape ', X_val_mybag.shape)
print('X_test shape ', X_test_mybag.shape)

X_train shape  (223446, 5000)
X_val shape  (55862, 5000)
X_test shape  (69828, 5000)


In [29]:
def tfidf_features(X_train, X_val, X_test):
    """
        Fit a TF-IDF vectorizer on the training texts and apply it to all splits.

        X_train, X_val, X_test — samples

        return TF-IDF vectorized representation of each sample and vocabulary
    """
    # token_pattern is a raw string: '\S' is a regex escape and an invalid
    # escape sequence in a normal string literal.
    tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2), token_pattern=r'(\S+)')
    # The vocabulary and IDF weights come from the training split only; the
    # validation and test splits are transformed with the fitted vectorizer.
    X_train = tfidf_vectorizer.fit_transform(X_train)
    X_val = tfidf_vectorizer.transform(X_val)
    X_test = tfidf_vectorizer.transform(X_test)

    return X_train, X_val, X_test, tfidf_vectorizer.vocabulary_
# Vectorize all three splits and keep the vocabulary (term -> column index).
(X_train_tfidf,
 X_val_tfidf,
 X_test_tfidf,
 tfidf_vocab) = tfidf_features(X_train, X_val, X_test)
# Inverse lookup: column index -> term.
tfidf_reversed_vocab = {column: term for term, column in tfidf_vocab.items()}

In [30]:
# First ten vocabulary terms in sorted order.
sorted(tfidf_vocab)[:10]

['#', '# #', '# ##', '# ###', '# ##0', '# #gt', '# +', '# 0', '# 09', '# 1']

In [32]:
# Two binarizers configured with the same sorted list of tag classes:
# `mlb` is used for the training labels and `mlb1` for the validation labels.
mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))
mlb1 = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))
# The encoded labels are stored under new names (y_train_1 / y_val_1) in the
# following cells, so y_train and y_val keep their original tag lists.

In [33]:
# Encode the training tag lists with `mlb`.
y_train_1 = mlb.fit_transform(y_train)

In [34]:
# Encode the validation tag lists with `mlb1`, which was built with the same
# class list as `mlb`.
y_val_1 = mlb1.fit_transform(y_val)

In [35]:
# Several classifiers were tried (MLP, SVC, logistic regression, AdaBoost,
# ridge); the ridge classifier was picked as the best one.
def train_classifier(X_train, y_train):
    """
      Fit a one-vs-rest ridge classifier.

      X_train, y_train — training data

      return: trained classifier
    """
    # NOTE(review): the `normalize` argument is not accepted by recent
    # scikit-learn releases - confirm against the installed version.
    ridge = RidgeClassifier(normalize=True)
    return OneVsRestClassifier(ridge).fit(X_train, y_train)

In [38]:
# Only the TF-IDF model is trained; the bag-of-words variant is disabled.
#classifier_mybag = train_classifier(X_train_mybag, y_train_1)
classifier_tfidf = train_classifier(X_train_tfidf, y_train_1)

In [40]:
# Persist the trained TF-IDF classifier.
filename = 'bow_model~.sav'
filename2 = 'tfidf_model.sav'
#pickle.dump(classifier_mybag, open(filename, 'wb'))
# The TF-IDF model goes to the TF-IDF file name (it used to be written to the
# bag-of-words file name), and the `with` block closes the file so the pickle
# is fully flushed to disk.
with open(filename2, 'wb') as model_file:
    pickle.dump(classifier_tfidf, model_file)


In [41]:
# Bag-of-words predictions are disabled together with that model.
#y_val_predicted_labels_mybag = classifier_mybag.predict(X_val_mybag)
#y_val_predicted_scores_mybag = classifier_mybag.decision_function(X_val_mybag)

# Label predictions and decision-function scores on the validation set.
y_val_predicted_labels_tfidf = classifier_tfidf.predict(X_val_tfidf)
y_val_predicted_scores_tfidf = classifier_tfidf.decision_function(X_val_tfidf)

In [42]:
# Map the binary label rows back to tag names for inspection.
y_val_pred_inversed = mlb.inverse_transform(y_val_predicted_labels_tfidf)
y_val_inversed = mlb.inverse_transform(y_val_1)

# Print validation samples 1..119 (index 0 is skipped) with their true and
# predicted tags.
report_template = 'Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'
for i in range(1, 120):
    true_tags = ','.join(y_val_inversed[i])
    predicted_tags = ','.join(y_val_pred_inversed[i])
    print(report_template.format(X_val[i], true_tags, predicted_tags))

Title:	insert line breaks text file im creating sd card code pretty standard sreport contains text would like output text file contains several n line breaks render fine textview text file theyre nonexistent must something easy im missing try fileoutputstream fout new fileoutputstream sdir + sfile write fout write sreport getbytes close output stream fout flush fout close catch ioexception e e printstacktrace
True labels:	android
Predicted labels:	


Title:	devexpress treelist adding objects show object name trying add objects treelist devexpress sadly fill cells automatically instead displays every cell object name public class tempitem private m_name string private m_value string public property name string get return m_name end get set value string m_name value end set end property public property value string get return m_value end get set value string m_value value end set end property end class private function tempitem1 list tempitem dim tmplist new list tempitem tmplist add new

In [None]:
def print_evaluation_scores(y_val, predicted):
    """
    Print four unlabeled metrics, one per line, in this order:
    accuracy, weighted recall, weighted F1, average precision.

    y_val: true labels
    predicted: predicted labels
    """
    print(accuracy_score(y_val, predicted))
    for weighted_metric in (recall_score, f1_score):
        print(weighted_metric(y_val, predicted, average='weighted'))
    print(average_precision_score(y_val, predicted))
    

In [None]:
# Score the TF-IDF model's label predictions against the encoded validation labels.
print_evaluation_scores(y_val_1,y_val_predicted_labels_tfidf)