In [1]:
from google.colab import drive

# Mount Google Drive inside the Colab VM so the project files are reachable.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import os

# Move into the notebook's project folder so the relative '../Data/...' paths
# used by later cells resolve.
# An absolute path is used (Drive is mounted at /content/drive above) so the
# cell is idempotent: the relative './drive/...' form only resolves from the
# initial working directory and fails if the cell is run a second time.
os.chdir('/content/drive/MyDrive/sharif/Spider/ipython(guide)')

### Libs

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from google.colab import drive
import ast
import re
import pickle


## 1 - Collecting Data


In [4]:
# Read the preprocessed corpus and print its 'body' column.
preprocessed_path = '../Data/preprocessed.csv'
data = pandas.read_csv(preprocessed_path)
print(data['body'])

0        ['جزو', 'دولوپرهایی', 'سال', 'کدنویسی', 'کسب',...
1        ['دولوپر\u200cها', 'ابزار\u200cها', 'مختلفی', ...
2        ['می\u200cشناسید', 'جزو', 'توسعه', 'دهندگان', ...
3        ['Microsoft', 'Research', 'Open', 'Data', 'ریپ...
4        ['کامپیوتر\u200cها', 'بزرگی', 'زندگی', 'تشکیل'...
                               ...                        
35662    ['مقاله', 'اتصال', 'کامپوننت\u200cها', 'رویداد...
35663    ['مقالات', 'آموزش', 'مقدماتی', 'جاوا', 'مطلب',...
35664    ['یادگیری', 'تقویتی', 'Reinforcement', 'Learni...
35665    ['نوار', 'ناوبری', 'مؤلفه\u200cها', 'اصلی', 'و...
35666    ['دنیای', 'دنیایی', 'شدهاست', 'اینترنت', 'دستر...
Name: body, Length: 35667, dtype: object


In [5]:
# Read the word/frequency table and preview its first rows.
freq_dist_path = '../Data/FreqDist_sorted.csv'
freq_dist = pandas.read_csv(freq_dist_path, index_col=False)
freq_dist.head(5)

Unnamed: 0,word,freq
0,کار,112636
1,برنامه,65916
2,قرار,61977
3,سایت,51340
4,می\u200cتوانید,40276


#### TF-ICF words

In [6]:
# Words that occur only once or twice are mostly noise or mis-detected tokens,
# so keep only the words with a frequency greater than 2.
# The filter is applied BEFORE sorting: the boolean mask is built from
# `freq_dist`, and applying it to an already re-ordered frame makes pandas
# re-align the mask by index (and can emit a "Boolean Series key will be
# reindexed" warning). Filtering first selects the same rows without that.
freq_dist_slice = (
    freq_dist.loc[freq_dist['freq'] > 2]
    .sort_values(by='freq', ascending=False)
)
print(len(freq_dist_slice))
freq_dist_slice

97394


  


Unnamed: 0,word,freq
0,کار,112636
1,برنامه,65916
2,قرار,61977
3,سایت,51340
4,می\u200cتوانید,40276
...,...,...
85051,Hope,3
85033,چرامن,3
85050,استایرن,3
85049,imanzakeri,3


In [7]:
# Based on TF-ICF we may need only 20% of the words.
# NOTE: the snippet below is disabled by wrapping it in a bare string literal;
# nothing in it runs, and the cell merely echoes the string as its output.
# If enabled, it would split `freq_dist_slice` into the first 20% of rows
# (`tf_icf_words`) and the remaining rows (`other_words`).
'''
tf_icf_words = freq_dist_slice.iloc[:int(len(freq_dist_slice)*.20)]['word'].values
other_words = freq_dist_slice.iloc[int(len(freq_dist_slice)*.20):]['word'].values
print(len(tf_icf_words))
tf_icf_words[:10]
'''

"\ntf_icf_words = freq_dist_slice.iloc[:int(len(freq_dist_slice)*.20)]['word'].values\nother_words = freq_dist_slice.iloc[int(len(freq_dist_slice)*.20):]['word'].values\nprint(len(tf_icf_words))\ntf_icf_words[:10]\n"

## 2 - Extract Features


#### TF-IDF words

In [8]:
# Vocabulary budget for TF-IDF: 10% of the number of rows kept in freq_dist_slice.
kept_word_count = freq_dist_slice.shape[0]
max_features = int(kept_word_count * 0.1)
max_features

9739

In [9]:
# Fit a word-level TF-IDF model, capped at `max_features` terms, on the
# 'body' column and keep the resulting document-term matrix in X.
# NOTE(review): data['body'] comes straight from read_csv and looks like
# stringified token lists in the printed preview; presumably the vectorizer's
# tokenizer is relied on to strip the list punctuation. Confirm this is intended.
corpus = data['body']
vectorizer = TfidfVectorizer(max_features=max_features, analyzer='word')
X = vectorizer.fit_transform(corpus)
print(type(X))

<class 'scipy.sparse.csr.csr_matrix'>


In [10]:
# Save the fitted vectorizer. The context manager guarantees the file handle
# is flushed and closed even if pickling raises.
with open("../Data/tf_idf.pkl", "wb") as tfidf_file:
    pickle.dump(vectorizer, tfidf_file)

# To load (only unpickle files you trust: pickle can execute arbitrary code):
# with open("../Data/tf_idf.pkl", "rb") as tfidf_file:
#     tf = pickle.load(tfidf_file)
# tf_new = TfidfVectorizer(analyzer='word', ngram_range=(1,2), stop_words = "english", lowercase = True,
#                           max_features = 500000, vocabulary = tf.vocabulary_)

In [28]:
# Disabled: would fetch the learned vocabulary terms from the vectorizer.
# feature_names = vectorizer.get_feature_names()
# Show the repr of the TF-IDF matrix X.
X

<35667x9739 sparse matrix of type '<class 'numpy.float64'>'
	with 5772201 stored elements in Compressed Sparse Row format>

#### Second way (disabled): TF-IDF over the full vocabulary, then `SelectKBest` feature selection

In [12]:
# Disabled alternative (bare string literal: not executed, only echoed as output).
# If enabled, it would set max_features to 0.2% of the rows in freq_dist_slice,
# overwriting the value computed earlier in the notebook.
'''
max_features = int(freq_dist_slice.shape[0]*0.002)
max_features
'''

'\nmax_features = int(freq_dist_slice.shape[0]*0.002)\nmax_features\n'

In [13]:
# Disabled alternative (bare string literal: not executed, only echoed as output).
# If enabled, it would fit a TF-IDF vectorizer with no max_features cap on
# data['body'] and store the result in df_t.
'''
from sklearn.feature_selection import SelectKBest
vectorizer = TfidfVectorizer(analyzer='word')
df_t = vectorizer.fit_transform(data['body'])
'''

"\nfrom sklearn.feature_selection import SelectKBest\nvectorizer = TfidfVectorizer(analyzer='word')\ndf_t = vectorizer.fit_transform(data['body'])\n"

In [14]:
# Disabled alternative (bare string literal: not executed, only echoed as output).
# If enabled, it would keep the max_features best columns of df_t, scored
# against data['tag'] with SelectKBest.
'''
df_t_reduced = SelectKBest(k=max_features).fit_transform(df_t, data['tag'])
max_features
'''

"\ndf_t_reduced = SelectKBest(k=max_features).fit_transform(df_t, data['tag'])\nmax_features\n"

In [15]:
# Disabled alternative (bare string literal: not executed, only echoed as output).
# If enabled, it would display the shape of the reduced matrix df_t_reduced.
'''
df_t_reduced.shape
'''

'\ndf_t_reduced.shape\n'

#### SVD

In [27]:
# Number of SVD components: 5% of the TF-IDF vocabulary budget, truncated to an int.
svd_ratio = 0.05
svd_features = int(max_features * svd_ratio)
svd_features

486

In [30]:
# Reduce the TF-IDF matrix X to `svd_features` dimensions with a truncated SVD
# (fixed random_state for reproducibility) and report the resulting shape.
svd = TruncatedSVD(random_state=42, n_components=svd_features)
features = svd.fit_transform(X)
print(features.shape)

(35667, 486)


In [31]:
# Save the fitted SVD model. The context manager guarantees the file handle
# is flushed and closed even if pickling raises.
with open("../Data/svd.p", "wb") as svd_file:
    pickle.dump(svd, svd_file)

# To load again (pickles are binary, so the file must be opened with 'rb';
# only unpickle files you trust: pickle can execute arbitrary code):
# with open('../Data/svd.p', 'rb') as svd_file:
#     svd = pickle.load(svd_file)

## Saving Data

In [32]:
# Wrap the SVD output in a DataFrame, write it to CSV without the index
# column, and show the frame's shape.
features_csv_path = '../Data/features.csv'
df = pandas.DataFrame(data=features)
df.to_csv(features_csv_path, index=False)
df.shape

(35667, 486)

In [34]:
# Preview the first five rows of the saved feature table.
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485
0,0.397011,0.010142,0.035696,-0.048785,0.144684,-0.008509,0.046604,-0.015157,0.020524,-0.082664,-0.014026,-0.080423,0.042881,-0.04766,-0.05529,-0.07542,0.010538,0.040187,-0.098267,-0.005981,-0.122762,0.070221,0.013763,-0.015272,0.023644,-0.035254,-0.02823,-0.032182,0.000251,-0.051448,0.021222,-0.043869,0.069094,0.064546,-0.014225,0.061065,0.018209,-0.038685,0.054074,-0.025347,...,0.022275,-0.024831,-0.018195,-0.005031,0.002362,-0.004617,-0.039723,-0.018533,0.041207,-0.000103,0.008447,-0.030017,-0.009298,-0.01412,-0.008937,0.001638,-0.00186,0.026218,0.016621,0.007905,0.021827,0.008038,0.006823,0.024539,0.001336,-0.002587,-0.031508,-0.000783,-0.011612,-0.008435,-0.0234,0.001998,-0.008012,0.018185,0.009328,-0.003908,0.011458,0.005022,-0.029736,-0.005598
1,0.451446,0.039517,-0.118316,0.000317,0.181427,0.002599,-0.121608,0.019201,0.091485,0.000394,0.001151,-0.145775,0.061216,-0.036319,-0.035141,-0.027668,-0.030637,0.012485,-0.063829,0.000527,0.044708,-0.047579,0.052424,-0.123169,0.133478,0.041541,-0.025883,0.016167,0.012518,-0.142581,0.033072,-0.010612,0.003448,0.081157,0.027328,0.090791,-0.015968,-0.006823,0.027847,-0.029511,...,-0.011924,-0.009498,-0.013241,0.03103,0.019263,-0.003422,-0.017404,-0.035824,-0.003205,0.007776,0.012393,0.00766,0.015475,0.008464,0.006771,-0.005472,0.001841,0.002501,0.019339,-0.007745,0.040592,0.001121,0.01821,0.005311,-0.010508,0.008205,-0.030945,0.018989,-0.021256,0.006052,-0.019965,0.021459,0.000564,0.00964,0.006653,-0.013269,-0.0069,0.020937,-0.013406,-0.006702
2,0.331651,0.129685,-0.045473,0.002326,0.215684,0.05504,-0.01702,-0.009697,0.137331,0.013271,0.05171,-0.151839,-0.010013,0.051206,0.042456,-0.090159,-0.048738,0.0611,-0.177042,-0.034255,0.072493,-0.052387,-0.016069,-0.075256,0.207868,-0.113653,-0.004145,-0.045883,-0.008561,-0.07277,0.038375,-0.061094,0.061367,0.138824,-0.051879,0.047647,-0.044616,-0.020269,0.072478,-0.031684,...,-0.002691,0.008755,-0.02984,0.000766,-0.015376,0.020477,-0.004879,0.022101,-0.004029,-0.006437,0.011723,0.001259,0.001532,-0.002848,0.021177,0.018923,0.012882,0.009535,0.003894,0.006829,0.003183,-0.015817,0.027744,-0.038462,-0.025204,-0.002417,0.010226,-0.010109,-0.009244,-0.00578,0.003224,-0.011244,-0.004257,0.003668,-0.002268,0.003096,0.001129,0.015082,-0.003892,0.010666
3,0.285822,0.014615,-0.027662,0.064,0.062034,-0.022651,-0.144724,-0.015963,0.020738,0.060718,-0.021528,-0.035681,-0.001779,-0.01897,0.016189,-0.038083,-0.027235,0.034763,-0.016258,-0.006819,0.007281,-0.048281,0.024478,0.01351,0.145869,-0.037331,0.012631,-0.000357,0.017089,-0.088744,0.063107,-0.003739,0.103563,0.07757,0.004461,0.000498,-0.069377,-0.019054,0.107282,-0.02452,...,-0.018141,-0.002177,0.001437,-0.008483,0.001295,-0.001225,-0.000922,-0.018537,-0.035982,0.031538,-0.013893,-0.007607,-0.000398,-0.003273,-0.001153,-0.008313,0.004268,0.01089,-0.003277,-0.011998,0.006188,-0.008164,-0.01095,0.014098,0.007882,0.001968,-0.016149,-0.00742,0.020838,-0.020871,-0.000868,-0.019491,-0.006624,-0.01173,-0.006782,0.007055,-0.008513,-0.021847,-0.010223,-0.003442
4,0.36276,0.059624,-0.063547,-0.13043,0.279658,0.084157,0.113535,-0.040843,0.047758,0.04593,-0.022257,-0.013728,0.015986,0.020277,-0.060062,-0.069616,-0.04859,0.035215,-0.083185,-0.0465,0.025037,0.045352,-0.062258,0.096613,0.009655,-0.05126,-0.013012,0.028878,0.013491,-0.002748,-0.003552,-0.044303,0.053882,0.004812,0.030705,0.010485,-0.018536,-0.073784,0.094094,-0.055921,...,-0.005224,0.000588,0.009378,-0.022673,0.009308,0.013077,-0.007959,0.005976,-0.012855,-0.005065,-0.014176,-0.012237,-0.034404,-0.004117,-0.021715,-0.011424,0.028551,-0.012493,-0.014829,-0.005489,-0.031631,0.009435,0.014332,0.010991,-0.016519,-0.001459,-0.006953,-0.004482,-0.000921,-0.015148,-0.005613,-0.01241,0.014771,-0.016763,0.022788,-0.009443,0.012662,-0.010538,0.000604,0.012246
