In [1]:
import numpy as np
import pandas as pd
data = pd.read_excel('test_assignment.xlsx')

In [2]:
data = data.dropna(axis=0)

In [3]:
data.shape

(6731, 2)

In [4]:
data.drop_duplicates(subset ="String", inplace=True)

In [5]:
data.nunique()

String    6730
Tag       1080
dtype: int64

In [6]:
data.head()

Unnamed: 0,String,Tag
0,"3GPP Technical Specification Group, ""Spatial c...",1.0
1,"3GPP Technical Specification Group, ""Spatial c...",1.0
2,"3GPP Technical Specification Group, ""Spatial c...",1.0
3,"3GPP Technical Specification Group, ""Spatial c...",1.0
4,"3GPP TR 25.876 V7.0.0 (2007-03), Technical Rep...",2.0


In [7]:
import logging
import datefinder
from datetime import datetime
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from textblob import Word
import re
%matplotlib inline

unable to import 'smart_open.gcs', disabling that module


In [8]:
def print_plot(index):
    example = data[data.index == index][['String', 'Tag']].values[0]
    print(example[0])
    print('Tag:', example[1])
print_plot(60)

3GPP, "LTE", pages 1-63, April 2010 [online], Retrieved from the Internet:< URL:http://www.3gpp.org/LTE>.
Tag: 8.0


In [9]:
REPLACE_BY_SPACE_RE = re.compile('[(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z ]')
STOPWORDS = set(stopwords.words('english'))
month_dict = dict(jan='01', feb='02', mar='03', apr='04', may='05', jun='06', jul='07', aug='08', sep='09',
                  oct='10', nov='11', dec='12')
def word_to_num(string):
    #This function converts a string to lowercase and only accepts the first three letter.
    s = string.lower()[:3]
    return month_dict[s]

def date_converter(text):
    #This function extracts dates in every format from text and converts them to YYYYMMDD.
    results = []
    day = '01'
    month = '01'
    year = '1900'
    regex = re.search('([0]?\d|[1][0-2])[/-]([0-3]?\d)[/-]([1-2]\d{3}|\d{2})', text)
    # If format is DD Month YYYY or D Mon YY or some combination, also matches if no day given
    month_regex = re.search(
        '([0-3]?\d)\s*(Jan(?:uary)?(?:aury)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug('
        '?:ust)?|Sept?(?:ember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?(?:emeber)?).?,?\s([1-2]\d{3})',
        text)
    # If format is Month/DD/YYYY or Mon/D/YY or or Month DDth, YYYY or some combination
    rev_month_regex = re.search(
        '(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sept?(?:ember)?|Oct('
        '?:ober)?|Nov(?:ember)?|Dec(?:ember)?).?[-\s]([0-3]?\d)(?:st|nd|rd|th)?[-,\s]\s*([1-2]\d{3})',
        text)
    # If format is any combination of just Month or Mon and YY or YYYY
    no_day_regex = re.search(
        '(Jan(?:uary)?(?:aury)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sept?('
        '?:ember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?(?:emeber)?).?,?[\s]([1-2]\d{3}|\d{2})',
        text)
    # If format is MM/YYYY or M YYYY or some combination
    no_day_digits_regex = re.search('([0]?\d|[1][0-2])[/\s]([1-2]\d{3})', text)
    # If format only contains a year. If year is written alone it must be in form YYYY
    year_only_regex = re.search('([1-2]\d{3})', text)
    if regex:
        day = regex.group(2)
        month = regex.group(1)
        year = regex.group(3)
    elif month_regex:
        day = month_regex.group(1)
        month = word_to_num(month_regex.group(2))
        year = month_regex.group(3)
    elif rev_month_regex:
        day = rev_month_regex.group(2)
        month = word_to_num(rev_month_regex.group(1))
        year = rev_month_regex.group(3)
    elif no_day_regex:
        month = word_to_num(no_day_regex.group(1))
        year = no_day_regex.group(2)
    elif no_day_digits_regex:
        month = no_day_digits_regex.group(1)
        year = no_day_digits_regex.group(2)
    elif year_only_regex:
        year = year_only_regex.group(0)
    # Make sure all variables have correct number, add zeros if necessary
    month = month.zfill(2)
    day = day.zfill(2)
    if day == '00':
        day = '01'
    if year is not None and len(year) == 2:
        year = '19' + year
    results.append(year+'-'+month+'-'+day)
    return results

def unique_list(l):
    ulist = []
    [ulist.append(x) for x in l if x not in ulist]
    return ulist

def clean_text(text):        
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    dt = date_converter(text)
    text = text.lower() # lowercase text
    text = re.sub('retrieved from the internet', '', text)
    text = re.sub('online', '', text)
    text = re.sub('url', '', text)
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    te = re.findall(r'"(.*?)"', text)
    te = ''.join(te)
    te =' '.join(unique_list(te.split()))
    #te = ''.join(te.split())
    text = re.sub('- ', '', text)
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    te = BAD_SYMBOLS_RE.sub('', te)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    text =' '.join(unique_list(text.split()))
    text = text+' '+te
    dt = ''.join([str(elem) for elem in dt])
    text = text+' '+str(dt)
    return text
   
data['Conv_String'] = data['String'].apply(clean_text)
data['Conv_String'] = data['Conv_String'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [47]:
print(data['String'][35])

3GPP TR 25.913, "Requirements for Evolved UTRA (E-UTRA) and Evolved UTRAN (E-UTRAN)", V8.0.0 (Jan. 2009).


In [48]:
print(data['Conv_String'][35])

3gpp tr 25913 requirement evolved utra eutra utran eutran v800 jan 2009 requirement for evolved utra eutra and utran eutran 2009-01-01


In [12]:
data = pd.DataFrame(data, columns=['String','Conv_String','Tag'])

In [13]:
data.head()

Unnamed: 0,String,Conv_String,Tag
0,"3GPP Technical Specification Group, ""Spatial c...",3gpp technical specification group spatial cha...,1.0
1,"3GPP Technical Specification Group, ""Spatial c...",3gpp technical specification group spatial cha...,1.0
2,"3GPP Technical Specification Group, ""Spatial c...",3gpp technical specification group spatial cha...,1.0
3,"3GPP Technical Specification Group, ""Spatial c...",3gpp technical specification group spatial cha...,1.0
4,"3GPP TR 25.876 V7.0.0 (2007-03), Technical Rep...",3gpp tr 25876 v700 200703 technical report 3rd...,2.0


In [14]:
data['Conv_String'].apply(lambda x: len(x.split(' '))).sum()

152367

In [15]:
x = data.Conv_String

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [17]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')

In [18]:
x_count = count_vect.fit_transform(x)

In [19]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)

In [20]:
x_tfidf = tfidf_vect.fit_transform(x)

In [21]:
x_tfidf.shape

(6730, 3995)

In [22]:
x_count.shape

(6730, 3995)

In [23]:
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from matplotlib import style
from sklearn.datasets.samples_generator import make_blobs

In [24]:
m = data.Conv_String.shape[0]
m

6730

In [25]:
c=m//3
d=m//6
ls1 = [*range(c-40,c+80)]
ls2 = [*range(d-50,d+50)]
ls = list(set(ls1+ls2))
print(ls)
print(len(ls))

[1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 2203, 2204, 2205, 2206, 2207, 2208, 2209, 2210, 2211, 2212, 2213, 2214, 2215, 2216, 2217, 2218, 2219, 2220, 2221, 2222, 2223, 2224, 2225, 2226, 2227, 2228, 2229, 2230, 2231, 2232, 2233, 2234, 2235, 2236, 2237, 2238, 2239, 2240, 2241, 2242, 2243, 2244, 2245, 2246, 2247, 2248, 2249, 2250, 2251, 2252, 2253, 2254, 2255, 2256, 2257, 2258, 2259, 2260, 2261, 2262, 2263, 2264, 2265, 2266, 2267, 2268, 226

In [26]:
ls = [1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087]

In [27]:
'''
best_clusters = 0                       
previous_silh_avg = 0.0

for n_clusters in ls:
    clusterer = KMeans(n_clusters=n_clusters,init='k-means++', n_jobs=-2)
    cluster_labels = clusterer.fit_predict(x_tfidf)
    silhouette_avg = silhouette_score(x_tfidf, cluster_labels)
    if silhouette_avg > previous_silh_avg:
        previous_silh_avg = silhouette_avg
        best_clusters = n_clusters

# Final Kmeans for best_clusters
print(best_clusters)
'''

"\nbest_clusters = 0                       \nprevious_silh_avg = 0.0\n\nfor n_clusters in ls:\n    clusterer = KMeans(n_clusters=n_clusters,init='k-means++', n_jobs=-2)\n    cluster_labels = clusterer.fit_predict(x_tfidf)\n    silhouette_avg = silhouette_score(x_tfidf, cluster_labels)\n    if silhouette_avg > previous_silh_avg:\n        previous_silh_avg = silhouette_avg\n        best_clusters = n_clusters\n\n# Final Kmeans for best_clusters\nprint(best_clusters)\n"

In [28]:
best_clusters=1080

In [29]:
model = KMeans(n_clusters=best_clusters, 
               init='k-means++', 
               max_iter=100, # Maximum number of iterations of the k-means algorithm for a single run.
               n_init=1)

In [30]:
model.fit(x_tfidf)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=1080, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [31]:
labels = model.labels_

In [32]:
print(labels)

[970 970 970 ... 811 811 811]


In [33]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = tfidf_vect.get_feature_names()

In [34]:
test = data.Conv_String[28]
print(test)

3gpp tr 25913 requirement evolved utra eutra utran eutran v800 jan 2009 pp 120 requirement for evolved utra eutra and utran eutran 2009-01-01


In [35]:
test = data.Conv_String[1]
print(test)

3gpp technical specification group spatial channel model scm134 text v60 ahg combined adhoc 3gpp2 april 2003 pp 145 spatial channel model scm134 text v60 2003-04-01


In [36]:
x_t = tfidf_vect.transform([test])

cluster = model.predict(x_t)[0]

print("Text belongs to cluster number {0}".format(cluster))

Text belongs to cluster number 970


In [37]:
from sklearn.model_selection import train_test_split

In [38]:
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, labels, test_size=0.2, random_state=45)

In [39]:
from sklearn.linear_model import LogisticRegression

In [40]:
model_lr = LogisticRegression()

In [41]:
model_lr.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [42]:
y_pred = model_lr.predict(x_test)

In [43]:
print(accuracy_score(y_test, y_pred))

0.7919762258543833


In [44]:
data['Cluster'] = labels

In [45]:
data.head()

Unnamed: 0,String,Conv_String,Tag,Cluster
0,"3GPP Technical Specification Group, ""Spatial c...",3gpp technical specification group spatial cha...,1.0,970
1,"3GPP Technical Specification Group, ""Spatial c...",3gpp technical specification group spatial cha...,1.0,970
2,"3GPP Technical Specification Group, ""Spatial c...",3gpp technical specification group spatial cha...,1.0,970
3,"3GPP Technical Specification Group, ""Spatial c...",3gpp technical specification group spatial cha...,1.0,970
4,"3GPP TR 25.876 V7.0.0 (2007-03), Technical Rep...",3gpp tr 25876 v700 200703 technical report 3rd...,2.0,415


In [46]:
data.to_excel('data_clustering_3.xlsx')

In [None]:
style.use("fivethirtyeight") 

cost =[] 
for i in range(1077, 1082):
    KM = KMeans(n_clusters = i, max_iter = 500)
    KM.fit(x_count)
    cost.append(KM.inertia_)

# plot the cost against K values 
plt.plot(range(1077, 1082), cost, color ='g', linewidth ='3') 
plt.xlabel("Value of K") 
plt.ylabel("Sqaured Error (Cost)") 
plt.show() # clear the plot 

# the point of the elbow is the 
# most optimal value for choosing k 

In [None]:
def calculate_WSS(points, kmax):
  sse = []
  for k in range(1, kmax+1):
    kmeans = KMeans(n_clusters = k).fit(points)
    centroids = kmeans.cluster_centers_
    pred_clusters = kmeans.predict(points)
    curr_sse = 0
    
    # calculate square of Euclidean distance of each point from its cluster center and add to current WSS
    for i in range(len(points)):
      curr_center = centroids[pred_clusters[i]]
      curr_sse += (points[i, 0] - curr_center[0]) ** 2 + (points[i, 1] - curr_center[1]) ** 2
      
    sse.append(curr_sse)
  return sse

In [None]:
distortions = []
K = range(1,10)
for k in K:
    kmeanModel = KMeans(n_clusters=k).fit(messages)
    kmeanModel.fit(messages)
    distortions.append(sum(np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])

# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
for i in range(m):
    

In [None]:
best = KMeans(points)
for t in range

In [None]:
model = KMeans(n_clusters=4, init='k-means++', max_iter=100, n_init=1)
model.fit(x)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train, x_test = train_test_split(x, test_size=0.2, random_state=1)

In [None]:
pred = model.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
string1 = "calvin klein design dress calvin klein"
string2 = ' '.join(set(string1.split()))
print(string2)

In [None]:
REPLACE_BY_SPACE_RE = re.compile('[(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z ]')
STOPWORDS = set(stopwords.words('english'))
def unique_list(l):
    ulist = []
    [ulist.append(x) for x in l if x not in ulist]
    return ulist

def clean_text(text):
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    te = re.findall(r'"(.*?)"', text)
    te = ''.join(te)
    te =' '.join(unique_list(te.split()))
    te = ''.join(te.split())
    text = re.sub('- ', '', text)
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    te = BAD_SYMBOLS_RE.sub('', te)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    text =' '.join(unique_list(text.split()))
    text = text+' '+te
    return text
    
data['Conv_String'] = data['String'].apply(clean_text)

In [None]:
lookup_dict = {'rt':'Retweet', 'dm':'direct message', "awsm" : "awesome", "luv" :"love", "..."}
def _lookup_words(input_text):
    words = input_text.split() 
    new_words = [] 
    for word in words:
        if word.lower() in lookup_dict:
            word = lookup_dict[word.lower()]
        new_words.append(word) new_text = " ".join(new_words) 
        return new_text

_lookup_words("RT this is a retweeted tweet by Shivam Bansal")
>> "Retweet this is a retweeted tweet by Shivam Bansal"

In [3]:
import sys
from gensim.models import Word2Vec
sentences = [['data', 'science'], ['vidhya', 'science', 'data', 'analytics'],['machine', 'learning'], ['deep', 'learning']]

# train the model on your corpus  
model = Word2Vec(sentences, min_count = 1)

print(model.similarity('data', 'science'))

0.06313193


  


In [None]:
from textblob import TextBlob
train['tweet'][:5].apply(lambda x: str(TextBlob(x).correct()))

In [126]:
dt= '3gpp tr 25913 requirement evolved utra eutra utran eutran v800 jan 2009 pp 120 requirement for evolved utra eutra and utran eutran'

In [127]:
dt1 = '3GPP TR 25.913, "Requirements for Evolved UTRA (E-UTRA) and Evolved UTRAN (E-UTRAN)", V8.0.0 ( Jan. 2009), pp. 1-20.'

In [57]:
from dateparser.search import search_dates
dates = search_dates(dt1)
print(dates)

[('913', datetime.datetime(1900, 1, 1, 9, 1, 3)), ('Jan', datetime.datetime(1900, 1, 1, 0, 0)), ('2009', datetime.datetime(2009, 1, 1, 0, 0)), ('1-20', datetime.datetime(2009, 1, 20, 0, 0))]


In [107]:
import datefinder
import parsedatetime
from datetime import datetime
from dateparser.search import search_dates

ls=[]
matches = datefinder.find_dates(dt1)
for match in matches:
    print([match])

[datetime.datetime(2009, 1, 29, 0, 0)]
[datetime.datetime(2020, 1, 20, 0, 0)]


In [None]:
import re
from datetime import datetime

match = re.search(r'\d{4}-\d{2}-\d{2}', text)
date = datetime.strptime(match.group(), '%Y-%m-%d').date()

In [125]:
import datetime

matches = datefinder.find_dates(dt1)
for match in matches:
    date_time_obj = datetime.datetime.strptime(match, '%Y-%m-%d %H:%M:%S')

TypeError: strptime() argument 1 must be str, not datetime.datetime

In [130]:
all = re.findall(r"[\d]{1,2} [ADFJMNOS]\w* [\d]{4}", dt1)
print(all)

[]


In [120]:
from dateutil.parser import parse
result = parse("Today is 12-01-18", fuzzy_with_tokens=True)
 
# get just the datetime object
result[0]

datetime.datetime(2018, 12, 1, 0, 0)

In [None]:
import datetime
from datetime import date
import re
s = "Jason's birthday is on 1991-09-21"
match = re.search(r'\d{4}-\d{2}-\d{2}', s)
date = datetime.datetime.strptime(match.group(), '%Y-%m-%d').date()
print date
import re
m = re.search('(?<=-)\w+', 'derer-10-12-2001.zip')
print m.group(0)
re.search("([0-9]{2}\-[0-9]{2}\-[0-9]{4})", fileName)
m = re.search('\b(\d{2}-\d{2}-\d{4})\.', 'derer-10-12-2001.zip')
print m.group(1)
re.search(r'(?<=-)[\d-]+(?=\.)', name).group(0)
import re
from datetime import datetime

match = re.search(r'\d{4}-\d{2}-\d{2}', text)
date = datetime.strptime(match.group(), '%Y-%m-%d').date()
In [1]: import dateutil.parser as dparser

In [18]: dparser.parse("monkey 2010-07-10 love banana",fuzzy=True)
Out[18]: datetime.datetime(2010, 7, 10, 0, 0)
    import datefinder

input_string = "monkey 2010-07-10 love banana"
# a generator will be returned by the datefinder module. I'm typecasting it to a list. Please read the note of caution provided at the bottom.
matches = list(datefinder.find_dates(input_string))

if len(matches) > 0:
    # date returned will be a datetime.datetime object. here we are only using the first match.
    date = matches[0]
    print date
else:
    print 'No dates found'
    
from pygrok import Grok

input_string = 'monkey 2010-07-10 love banana'
date_pattern = '%{YEAR:year}-%{MONTHNUM:month}-%{MONTHDAY:day}'

grok = Grok(date_pattern)
print(grok.match(input_string))
fmt_string2 = I want to apply for leaves from 12/12/2017 to 12/18/2017
''.join(fmt_string2.split()[-1].split('.')[::-10])
import re

string = 'I want to apply for leaves from 12/12/2017 to 12/18/2017 I want to apply for leaves from 12 January 2017 to ' \
       '12/18/2017 I want to apply for leaves from 12/12/2017 to 12 Jan 17 '

matches = re.findall('(\d{2}[\/ ](\d{2}|January|Jan|February|Feb|March|Mar|April|Apr|May|May|June|Jun|July|Jul|August|Aug|September|Sep|October|Oct|November|Nov|December|Dec)[\/ ]\d{2,4})', string)

for match in matches:
    print(match[0])
pattern = re.compile(r'from (.*) to (.*)')    
matches = re.findall(pattern, text)
for val in matches:
    try:
        dt_from = parse(val[0])
        dt_to = parse(val[1])

        print("Leave applied from", dt_from.strftime('%d/%b/%Y'), "to", dt_to.strftime('%d/%b/%Y'))
    except ValueError:
        print("skipping", val)
months_list= []
for month_idx in range(1, 13):
    months_list.append(calendar.month_name[month_idx])
    months_list.append(calendar.month_abbr[month_idx])

# join the list to use it as pyparsing keyword
month_keywords = " ".join(months_list)
# date separator - can be one of '/', '.', or ' '
separator = pp.Word("/. ")

# Dictionary for numeric date e.g. 12/12/2018
numeric_date = pp.Combine(pp.Word(pp.nums, max=2) + separator + pp.Word(pp.nums, max=2) + separator + pp.Word(pp.nums, max=4))

# Dictionary for text date e.g. 12/Jan/2018
text_date = pp.Combine(pp.Word(pp.nums, max=2) + separator + pp.oneOf(month_keywords) + separator + pp.Word(pp.nums, max=4))

# Either numeric or text date
date_pattern = numeric_date | text_date

# Final dictionary - from x to y
pattern = pp.Suppress(pp.SkipTo("from") + pp.Word("from") + pp.Optional("start") + pp.Optional("date")) + date_pattern
pattern += pp.Suppress(pp.Word("to") + pp.Optional("end") + pp.Optional("date")) + date_pattern

# Group the pattern, also it can be multiple
pattern = pp.OneOrMore(pp.Group(pattern))
result = pattern.parseString(text)

# Print result
for match in result:
    print("from", match[0], "to", match[1])
exp_date = re.findall(r'exp\w+ date[ :]*\d+[ -/]\d+[ -/]\d+',w.text,re.IGNORECASE)
date, month = re.search("^Date:[^,]+,\ (\d+) (\w+)", Data, re.MULTILINE).groups()
date = int(date)

print(date, month)
import re
from datetime import datetime

data1 = re.sub(' ', '', data)
res = re.search(r'Date(.*)$', data1, re.MULTILINE).group()
res2 = datetime.strptime(res, 'Date:%a,%d%b%Y%X%z')
print(res2.day, res2.month)
import re
from dateutil.parser import parse

date = re.search("Date(.*)$", s, re.MULTILINE)
if date:
    date = date.group().replace("Date:", "").strip()
    d = parse(date)
    Date = d.day
    Month = d.strftime("%b")
print(Date, Month)
import re
s = """Registrar Registration Expiration Date: 10/4/2018
Expiry date: 10/4/2018 """

print(re.findall('Expiration Date:*(.+)|Expiry Date:*(.+)', s, re.IGNORECASE))
 fmt_string2 = I want to apply for leaves from 12/12/2017 to 12/18/2017
''.join(fmt_string2.split()[-1].split('.')[::-10])
import re

string = 'I want to apply for leaves from 12/12/2017 to 12/18/2017 I want to apply for leaves from 12 January 2017 to ' \
       '12/18/2017 I want to apply for leaves from 12/12/2017 to 12 Jan 17 '

matches = re.findall('(\d{2}[\/ ](\d{2}|January|Jan|February|Feb|March|Mar|April|Apr|May|May|June|Jun|July|Jul|August|Aug|September|Sep|October|Oct|November|Nov|December|Dec)[\/ ]\d{2,4})', string)

for match in matches:
    print(match[0])
pattern = re.compile(r'from (.*) to (.*)')    
matches = re.findall(pattern, text)
for val in matches:
    try:
        dt_from = parse(val[0])
        dt_to = parse(val[1])

        print("Leave applied from", dt_from.strftime('%d/%b/%Y'), "to", dt_to.strftime('%d/%b/%Y'))
    except ValueError:
        print("skipping", val)
months_list= []
for month_idx in range(1, 13):
    months_list.append(calendar.month_name[month_idx])
    months_list.append(calendar.month_abbr[month_idx])

# join the list to use it as pyparsing keyword
month_keywords = " ".join(months_list)
# date separator - can be one of '/', '.', or ' '
separator = pp.Word("/. ")

# Dictionary for numeric date e.g. 12/12/2018
numeric_date = pp.Combine(pp.Word(pp.nums, max=2) + separator + pp.Word(pp.nums, max=2) + separator + pp.Word(pp.nums, max=4))

# Dictionary for text date e.g. 12/Jan/2018
text_date = pp.Combine(pp.Word(pp.nums, max=2) + separator + pp.oneOf(month_keywords) + separator + pp.Word(pp.nums, max=4))

# Either numeric or text date
date_pattern = numeric_date | text_date

# Final dictionary - from x to y
pattern = pp.Suppress(pp.SkipTo("from") + pp.Word("from") + pp.Optional("start") + pp.Optional("date")) + date_pattern
pattern += pp.Suppress(pp.Word("to") + pp.Optional("end") + pp.Optional("date")) + date_pattern

# Group the pattern, also it can be multiple
pattern = pp.OneOrMore(pp.Group(pattern))
result = pattern.parseString(text)

# Print result
for match in result:
    print("from", match[0], "to", match[1])
(\d{1,4}([.\-/])\d{1,2}([.\-/])\d{1,4})
all = re.findall(r"[\d]{1,2}/[\d]{1,2}/[\d]{4}", str)


In [None]:
import re

test_cases = ['04/30/2009', '06/20/95', '8/2/69', '1/25/2011', '9/3/2002', '4-13-82', 'Mar-02-2009', 'Jan 20, 1974',
        'March 20, 1990', 'Dec. 21, 2001', 'May 25 2009', '01 Mar 2002', '2 April 2003', '20 Aug. 2004',
        '20 November, 1993', 'Aug 10th, 1994', 'Sept 1st, 2005', 'Feb. 22nd, 1988', 'Sept 2002', 'Sep 2002',
        'December, 1998', 'Oct. 2000', '6/2008', '12/2001', '1998', '2002']

# Create a dictionary to convert from month names to numbers (e.g. Jan = 01)
month_dict = dict(jan='01', feb='02', mar='03', apr='04', may='05', jun='06', jul='07', aug='08', sep='09',
                  oct='10', nov='11', dec='12')


def word_to_num(string):
    """
    This function converts a string to lowercase and only accepts the first three letter.
    This is to prepare a string for month_dict
    Example:
        word_to_num('January') -> jan
    """

    s = string.lower()[:3]
    return month_dict[s]


def date_converter(line):
    """
    This function extracts dates in every format from text and converts them to YYYYMMDD.
    Example:
        date_converter("It was the May 1st, 2009") -> 20090501
    """
    results = []
    day = '01'
    month = '01'
    year = '1900'
    # If format is MM/DD/YYYY or M/D/YY or some combination
    regex = re.search('([0]?\d|[1][0-2])[/-]([0-3]?\d)[/-]([1-2]\d{3}|\d{2})', line)
    # If format is DD Month YYYY or D Mon YY or some combination, also matches if no day given
    month_regex = re.search(
        '([0-3]?\d)\s*(Jan(?:uary)?(?:aury)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug('
        '?:ust)?|Sept?(?:ember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?(?:emeber)?).?,?\s([1-2]\d{3})',
        line)
    # If format is Month/DD/YYYY or Mon/D/YY or or Month DDth, YYYY or some combination
    rev_month_regex = re.search(
        '(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sept?(?:ember)?|Oct('
        '?:ober)?|Nov(?:ember)?|Dec(?:ember)?).?[-\s]([0-3]?\d)(?:st|nd|rd|th)?[-,\s]\s*([1-2]\d{3})',
        line)
    # If format is any combination of just Month or Mon and YY or YYYY
    no_day_regex = re.search(
        '(Jan(?:uary)?(?:aury)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sept?('
        '?:ember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?(?:emeber)?).?,?[\s]([1-2]\d{3}|\d{2})',
        line)
    # If format is MM/YYYY or M YYYY or some combination
    no_day_digits_regex = re.search('([0]?\d|[1][0-2])[/\s]([1-2]\d{3})', line)
    # If format only contains a year. If year is written alone it must be in form YYYY
    year_only_regex = re.search('([1-2]\d{3})', line)
    if regex:
        day = regex.group(2)
        month = regex.group(1)
        year = regex.group(3)
    elif month_regex:
        day = month_regex.group(1)
        month = word_to_num(month_regex.group(2))
        year = month_regex.group(3)
    elif rev_month_regex:
        day = rev_month_regex.group(2)
        month = word_to_num(rev_month_regex.group(1))
        year = rev_month_regex.group(3)
    elif no_day_regex:
        month = word_to_num(no_day_regex.group(1))
        year = no_day_regex.group(2)
    elif no_day_digits_regex:
        month = no_day_digits_regex.group(1)
        year = no_day_digits_regex.group(2)
    elif year_only_regex:
        year = year_only_regex.group(0)
    # Make sure all variables have correct number, add zeros if necessary
    month = month.zfill(2)
    day = day.zfill(2)
    if day == '00':
        day = '01'
    if year is not None and len(year) == 2:
        year = '19' + year
    results.append(year + month + day)
    return results


test_run = [date_converter(w) for w in test_cases]
print(test_run)

from dateutil.parser import parse

test_cases = ['04/30/2009', '06/20/95', '8/2/69', '1/25/2011', '9/3/2002', '4-13-82', 'Mar-02-2009', 'Jan 20, 1974',
              'March 20, 1990', 'Dec. 21, 2001', 'May 25 2009', '01 Mar 2002', '2 April 2003', '20 Aug. 2004',
              '20 November, 1993', 'Aug 10th, 1994', 'Sept 1st, 2005', 'Feb. 22nd, 1988', 'Sept 2002', 'Sep 2002',
              'December, 1998', 'Oct. 2000', '6/2008', '12/2001', '1998', '2002']

for date_string in test_cases:
    print(date_string, parse(date_string).strftime("%Y%m%d"))